From 6d62d88d4f3899bbf9009547a7d731eed291b244 Mon Sep 17 00:00:00 2001
From: Philippe Tillet <phil@openai.com>
Date: Tue, 26 Jul 2022 17:25:03 -0700
Subject: [PATCH] [CI] run clang-format (#24)

---
 .github/workflows/integration-tests.yml       |    8 +-
 bin/FileCheck/FileCheck.cpp                   |   17 +-
 bin/triton-opt.cpp                            |   20 +-
 include/triton/Analysis/AxisInfo.h            |   67 +-
 include/triton/Conversion/Passes.h            |    8 +-
 .../TritonToTritonGPU/TritonToTritonGPU.h     |    7 +-
 include/triton/Dialect/Triton/IR/Dialect.h    |    9 +-
 include/triton/Dialect/Triton/IR/Traits.h     |   14 +-
 .../triton/Dialect/Triton/Transforms/Passes.h |    2 +-
 include/triton/Dialect/TritonGPU/IR/Dialect.h |    1 -
 .../Transforms/TritonGPUConversion.h          |    5 +-
 include/triton/driver/dispatch.h              |  387 +-
 include/triton/driver/error.h                 |  415 +-
 include/triton/driver/llvm.h                  |   25 +-
 include/triton/tools/bench.hpp                |   47 +-
 include/triton/tools/graph.h                  |   27 +-
 include/triton/tools/sha1.hpp                 |  248 +-
 include/triton/tools/sys/exec.hpp             |   18 +-
 include/triton/tools/sys/getenv.hpp           |   27 +-
 include/triton/tools/sys/mkdir.hpp            |   74 +-
 include/triton/tools/thread_pool.h            |  121 +-
 lib/Analysis/AxisInfo.cpp                     |  138 +-
 lib/Conversion/PassDetail.h                   |   10 +-
 .../TritonToTritonGPU/TritonToTritonGPU.cpp   |  265 +-
 lib/Dialect/Triton/IR/Dialect.cpp             |    8 +-
 lib/Dialect/Triton/IR/Ops.cpp                 |   58 +-
 lib/Dialect/Triton/IR/Types.cpp               |    4 +-
 lib/Dialect/Triton/Transforms/Combine.cpp     |   61 +-
 lib/Dialect/TritonGPU/IR/Dialect.cpp          |  112 +-
 lib/Dialect/TritonGPU/Transforms/Combine.cpp  |    4 +-
 lib/Dialect/TritonGPU/Transforms/Pipeline.cpp |  126 +-
 .../Transforms/TritonGPUConversion.cpp        |  109 +-
 lib/Dialect/TritonGPU/Transforms/Verifier.cpp |   17 +-
 lib/driver/dispatch.cc                        |  408 +-
 lib/driver/error.cc                           |  410 +-
 lib/driver/llvm.cc                            |  248 +-
 python/src/pybind11/attr.h                    |  610 +--
 python/src/pybind11/buffer_info.h             |  165 +-
 python/src/pybind11/cast.h                    | 3420 ++++++++-------
 python/src/pybind11/chrono.h                  |  241 +-
 python/src/pybind11/common.h                  |    3 +-
 python/src/pybind11/complex.h                 |   59 +-
 python/src/pybind11/detail/class.h            |  861 ++--
 python/src/pybind11/detail/common.h           | 1012 +++--
 python/src/pybind11/detail/descr.h            |   96 +-
 python/src/pybind11/detail/init.h             |  502 ++-
 python/src/pybind11/detail/internals.h        |  445 +-
 python/src/pybind11/detail/typeid.h           |   35 +-
 python/src/pybind11/eigen.h                   | 1044 +++--
 python/src/pybind11/embed.h                   |  182 +-
 python/src/pybind11/eval.h                    |  149 +-
 python/src/pybind11/functional.h              |  154 +-
 python/src/pybind11/iostream.h                |  168 +-
 python/src/pybind11/numpy.h                   | 2616 ++++++------
 python/src/pybind11/operators.h               |  284 +-
 python/src/pybind11/options.h                 |   74 +-
 python/src/pybind11/pybind11.h                | 3714 +++++++++--------
 python/src/pybind11/pytypes.h                 | 2154 +++++-----
 python/src/pybind11/stl.h                     |  546 +--
 python/src/pybind11/stl_bind.h                |  911 ++--
 python/src/triton.cc                          | 2018 +++++----
 test/lib/Analysis/TestAxisInfo.cpp            |   52 +-
 62 files changed, 13673 insertions(+), 11367 deletions(-)
 mode change 100755 => 100644 include/triton/driver/dispatch.h
 mode change 100755 => 100644 include/triton/driver/error.h
 mode change 100755 => 100644 include/triton/tools/sys/getenv.hpp
 mode change 100755 => 100644 include/triton/tools/sys/mkdir.hpp
 mode change 100755 => 100644 lib/driver/dispatch.cc
 mode change 100755 => 100644 lib/driver/error.cc

diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index 89ac8f403..cabbdba90 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -27,10 +27,16 @@ jobs:
           pip install isort
           isort -c ./python || ( echo '::error title=Imports not sorted::Please run \"isort ./python\"' ; exit 1 )
 
-      - name: Check style
+      - name: Check python style
         run: |
           pip install autopep8
           autopep8 -a -r -d --exit-code ./python || ( echo '::error title=Style issues::Please run \"autopep8 -a -r -i ./python\"' ; exit 1 )
+        
+      - name: Check cpp style
+        run: |
+          sudo apt-get install -y clang-format
+          find . -regex '.*\.\(cpp\|hpp\|h\|cc\)' -not -path "./python/build/*" -not -path "./include/triton/external/*" -print0 | xargs -0 -n1 clang-format -style=file --dry-run -Werror ||
+          (echo '::error title=Style issues:: Please run `find . -regex ".*\.\(cpp\|hpp\|h\|cc\)" -not -path "./python/build/*" -not -path "./include/triton/external/*" -print0 | xargs -0 -n1 clang-format -style=file -i`' ; exit 1)
 
       - name: Flake8
         run: |
diff --git a/bin/FileCheck/FileCheck.cpp b/bin/FileCheck/FileCheck.cpp
index 6742853c9..819efc354 100644
--- a/bin/FileCheck/FileCheck.cpp
+++ b/bin/FileCheck/FileCheck.cpp
@@ -57,9 +57,8 @@ static cl::opt<bool> NoCanonicalizeWhiteSpace(
     "strict-whitespace",
     cl::desc("Do not treat all horizontal whitespace as equivalent"));
 
-static cl::opt<bool> IgnoreCase(
-    "ignore-case",
-    cl::desc("Use case-insensitive matching"));
+static cl::opt<bool> IgnoreCase("ignore-case",
+                                cl::desc("Use case-insensitive matching"));
 
 static cl::list<std::string> ImplicitCheckNot(
     "implicit-check-not",
@@ -169,12 +168,6 @@ static cl::list<unsigned> DumpInputContexts(
 
 typedef cl::list<std::string>::const_iterator prefix_iterator;
 
-
-
-
-
-
-
 static void DumpCommandLine(int argc, char **argv) {
   errs() << "FileCheck command line: ";
   for (int I = 0; I < argc; I++)
@@ -613,8 +606,7 @@ static void DumpAnnotatedInput(raw_ostream &OS, const FileCheckRequest &Req,
     ElidedLinesOS.enable_colors(true);
   auto AnnotationItr = Annotations.begin(), AnnotationEnd = Annotations.end();
   for (unsigned Line = 1;
-       InputFilePtr != InputFileEnd || AnnotationItr != AnnotationEnd;
-       ++Line) {
+       InputFilePtr != InputFileEnd || AnnotationItr != AnnotationEnd; ++Line) {
     const unsigned char *InputFileLine = InputFilePtr;
 
     // Compute the previous and next line included by the filter.
@@ -691,8 +683,7 @@ static void DumpAnnotatedInput(raw_ostream &OS, const FileCheckRequest &Req,
     unsigned InputLineWidth = InputFilePtr - InputFileLine;
 
     // Print any annotations.
-    while (AnnotationItr != AnnotationEnd &&
-           AnnotationItr->InputLine == Line) {
+    while (AnnotationItr != AnnotationEnd && AnnotationItr->InputLine == Line) {
       WithColor COS(*LineOS, AnnotationItr->Marker.Color, /*Bold=*/true,
                     /*BG=*/false, TheColorMode);
       // The two spaces below are where the ": " appears on input lines.
diff --git a/bin/triton-opt.cpp b/bin/triton-opt.cpp
index d5d73e5f6..4942214cc 100644
--- a/bin/triton-opt.cpp
+++ b/bin/triton-opt.cpp
@@ -10,11 +10,11 @@
 #include "mlir/InitAllPasses.h"
 #include "mlir/Support/MlirOptMain.h"
 
-namespace mlir{
-namespace test{
+namespace mlir {
+namespace test {
 void registerTestAlignmentPass();
 }
-}
+} // namespace mlir
 
 int main(int argc, char **argv) {
   mlir::registerAllPasses();
@@ -25,13 +25,11 @@ int main(int argc, char **argv) {
 
   // TODO: register Triton & TritonGPU passes
   mlir::DialectRegistry registry;
-  registry.insert<mlir::triton::TritonDialect,
-                  mlir::triton::gpu::TritonGPUDialect,
-                  mlir::arith::ArithmeticDialect,
-                  mlir::StandardOpsDialect,
-                  mlir::scf::SCFDialect>();
+  registry
+      .insert<mlir::triton::TritonDialect, mlir::triton::gpu::TritonGPUDialect,
+              mlir::arith::ArithmeticDialect, mlir::StandardOpsDialect,
+              mlir::scf::SCFDialect>();
 
-  return mlir::asMainReturnCode(
-      mlir::MlirOptMain(argc, argv, "Triton (GPU) optimizer driver\n", registry)
-  );
+  return mlir::asMainReturnCode(mlir::MlirOptMain(
+      argc, argv, "Triton (GPU) optimizer driver\n", registry));
 }
diff --git a/include/triton/Analysis/AxisInfo.h b/include/triton/Analysis/AxisInfo.h
index 0910f341f..c9be250fc 100644
--- a/include/triton/Analysis/AxisInfo.h
+++ b/include/triton/Analysis/AxisInfo.h
@@ -10,7 +10,6 @@
 
 namespace mlir {
 
-
 //===----------------------------------------------------------------------===//
 // AxisInfo
 //===----------------------------------------------------------------------===//
@@ -25,26 +24,25 @@ public:
 
 public:
   // Default constructor
-  AxisInfo(): AxisInfo({}, {}, {}) { }
+  AxisInfo() : AxisInfo({}, {}, {}) {}
   // Construct contiguity info with known contiguity
   AxisInfo(ContiguityT knownContiguity, DivisibilityT knownDivisibility,
            ConstancyT knownConstancy)
-    : contiguity(knownContiguity), divisibility(knownDivisibility), 
-      constancy(knownConstancy), rank(contiguity.size()) { 
-      assert(knownDivisibility.size() == rank);
-      assert(knownConstancy.size() == rank);
-    }
-  
-  
+      : contiguity(knownContiguity), divisibility(knownDivisibility),
+        constancy(knownConstancy), rank(contiguity.size()) {
+    assert(knownDivisibility.size() == rank);
+    assert(knownConstancy.size() == rank);
+  }
+
   // Accessors
-  int getContiguity(size_t d) const { return contiguity[d];   }
-  const ContiguityT& getContiguity() const { return contiguity; }
+  int getContiguity(size_t d) const { return contiguity[d]; }
+  const ContiguityT &getContiguity() const { return contiguity; }
 
   int getDivisibility(size_t d) const { return divisibility[d]; }
-  const DivisibilityT& getDivisibility() const { return divisibility; }
+  const DivisibilityT &getDivisibility() const { return divisibility; }
 
-  int getConstancy(size_t d) const { return constancy[d];    }
-  const ConstancyT& getConstancy() const { return constancy; }
+  int getConstancy(size_t d) const { return constancy[d]; }
+  const ConstancyT &getConstancy() const { return constancy; }
 
   int getRank() const { return rank; }
 
@@ -56,13 +54,13 @@ public:
   }
 
   /// The pessimistic value state of the contiguity is unknown.
-  static AxisInfo getPessimisticValueState(MLIRContext *context) 
-  { return AxisInfo(); }
+  static AxisInfo getPessimisticValueState(MLIRContext *context) {
+    return AxisInfo();
+  }
   static AxisInfo getPessimisticValueState(Value value);
 
   // The gcd of both arguments for each dimension
-  static AxisInfo join(const AxisInfo &lhs,
-                       const AxisInfo &rhs);
+  static AxisInfo join(const AxisInfo &lhs, const AxisInfo &rhs);
 
 private:
   /// The _contiguity_ information maps the `d`-th
@@ -81,7 +79,7 @@ private:
   /// [19, 23, 27, 31]
   /// Would have contiguity [2, 1].
   ContiguityT contiguity;
-  
+
   /// The _divisibility_ information maps the `d`-th
   /// dimension to the largest power-of-two that
   /// divides the first element of all the values along it
@@ -107,39 +105,36 @@ private:
   /// [16, 16, 16, 16, 20, 20, 20, 20]
   /// would have constancy [1, 4]
   ConstancyT constancy;
-  
+
   // number of dimensions of the lattice
   int rank;
 };
 
-
-class AxisInfoAnalysis
-    : public ForwardDataFlowAnalysis<AxisInfo> {
+class AxisInfoAnalysis : public ForwardDataFlowAnalysis<AxisInfo> {
 
 private:
   static const int maxPow2Divisor = 65536;
- 
-  int highestPowOf2Divisor(int n){
-    if(n==0)
+
+  int highestPowOf2Divisor(int n) {
+    if (n == 0)
       return maxPow2Divisor;
     return (n & (~(n - 1)));
   }
 
-  AxisInfo visitBinaryOp(Operation* op, AxisInfo lhsInfo, AxisInfo rhsInfo,
-                         const std::function<int(AxisInfo,AxisInfo,int)>& getContiguity,
-                         const std::function<int(AxisInfo,AxisInfo,int)>& getDivisibility,
-                         const std::function<int(AxisInfo,AxisInfo,int)>& getConstancy);
+  AxisInfo visitBinaryOp(
+      Operation *op, AxisInfo lhsInfo, AxisInfo rhsInfo,
+      const std::function<int(AxisInfo, AxisInfo, int)> &getContiguity,
+      const std::function<int(AxisInfo, AxisInfo, int)> &getDivisibility,
+      const std::function<int(AxisInfo, AxisInfo, int)> &getConstancy);
 
 public:
   using ForwardDataFlowAnalysis<AxisInfo>::ForwardDataFlowAnalysis;
 
-  ChangeResult visitOperation(Operation *op,
-                      ArrayRef<LatticeElement<AxisInfo> *> operands) override;
-
+  ChangeResult
+  visitOperation(Operation *op,
+                 ArrayRef<LatticeElement<AxisInfo> *> operands) override;
 };
 
-
-}
- 
+} // namespace mlir
 
 #endif
\ No newline at end of file
diff --git a/include/triton/Conversion/Passes.h b/include/triton/Conversion/Passes.h
index 125551f5c..8cf53bc1c 100644
--- a/include/triton/Conversion/Passes.h
+++ b/include/triton/Conversion/Passes.h
@@ -3,17 +3,13 @@
 
 #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h"
 
-namespace mlir
-{
-namespace triton
-{
+namespace mlir {
+namespace triton {
 
 #define GEN_PASS_REGISTRATION
 #include "triton/Conversion/Passes.h.inc"
 
-
 } // namespace triton
 } // namespace mlir
 
-
 #endif
\ No newline at end of file
diff --git a/include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h b/include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h
index b21b6a1f1..bdb058249 100644
--- a/include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h
+++ b/include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h
@@ -3,18 +3,17 @@
 
 #include <memory>
 
-namespace mlir{
+namespace mlir {
 
 class ModuleOp;
 template <typename T> class OperationPass;
 
-namespace triton{
+namespace triton {
 
-std::unique_ptr<OperationPass<ModuleOp>> 
+std::unique_ptr<OperationPass<ModuleOp>>
 createConvertTritonToTritonGPUPass(int numWarps = 4);
 
 }
 } // namespace mlir
 
-
 #endif
\ No newline at end of file
diff --git a/include/triton/Dialect/Triton/IR/Dialect.h b/include/triton/Dialect/Triton/IR/Dialect.h
index 80a2aab2e..8590db9c4 100644
--- a/include/triton/Dialect/Triton/IR/Dialect.h
+++ b/include/triton/Dialect/Triton/IR/Dialect.h
@@ -1,17 +1,16 @@
 #ifndef TRITON_DIALECT_TRITON_IR_DIALECT_H_
 #define TRITON_DIALECT_TRITON_IR_DIALECT_H_
 
-
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/SCF/SCF.h"
 
-#include "triton/Dialect/Triton/IR/Traits.h"
-#include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/Triton/IR/Dialect.h.inc"
 #include "triton/Dialect/Triton/IR/OpsEnums.h.inc"
+#include "triton/Dialect/Triton/IR/Traits.h"
+#include "triton/Dialect/Triton/IR/Types.h"
 
 #define GET_OP_CLASSES
 #include "triton/Dialect/Triton/IR/Ops.h.inc"
diff --git a/include/triton/Dialect/Triton/IR/Traits.h b/include/triton/Dialect/Triton/IR/Traits.h
index a72e2c99c..fd20236f1 100644
--- a/include/triton/Dialect/Triton/IR/Traits.h
+++ b/include/triton/Dialect/Triton/IR/Traits.h
@@ -19,7 +19,7 @@ public:
   static LogicalResult verifyTrait(Operation *op) {
     // The rationale for this number is to prevent users from creating programs
     // that would have catastrophic register pressure and cause the compiler to
-    // hang. 
+    // hang.
     // Since H100 has 256KB registers, we should allow users to create tensors
     // of size up to 256K elements. It will spill for datatypes wider than 1B,
     // but we probably should limit number of elements (rather than bytes) to
@@ -31,8 +31,8 @@ public:
         for (int64_t s : tensorType.getShape())
           numElements *= s;
         if (numElements > maxElement)
-          return op->emitError("Maximum allowed number of elements is ") << maxElement << ", but "
-                 << *op << " has more than that";
+          return op->emitError("Maximum allowed number of elements is ")
+                 << maxElement << ", but " << *op << " has more than that";
         if ((numElements & (numElements - 1)) != 0)
           return op->emitError("Number of elements must be power-of-two, but ")
                  << *op << " doesn't follow the rule";
@@ -45,8 +45,8 @@ public:
         for (int64_t s : tensorType.getShape())
           numElements *= s;
         if (numElements > maxElement)
-          return op->emitError("Maximum allowed number of elements is ") << maxElement << ", but "
-                 << *op << " has more than that";
+          return op->emitError("Maximum allowed number of elements is ")
+                 << maxElement << ", but " << *op << " has more than that";
         if ((numElements & (numElements - 1)) != 0)
           return op->emitError("Number of elements must be power-of-two, but ")
                  << *op << " doesn't follow the rule";
@@ -57,7 +57,7 @@ public:
   }
 };
 
-}
-}
+} // namespace OpTrait
+} // namespace mlir
 
 #endif
diff --git a/include/triton/Dialect/Triton/Transforms/Passes.h b/include/triton/Dialect/Triton/Transforms/Passes.h
index 1064501b1..5dae1a498 100644
--- a/include/triton/Dialect/Triton/Transforms/Passes.h
+++ b/include/triton/Dialect/Triton/Transforms/Passes.h
@@ -13,6 +13,6 @@ std::unique_ptr<Pass> createCombineOpsPass();
 #define GEN_PASS_REGISTRATION
 #include "triton/Dialect/Triton/Transforms/Passes.h.inc"
 
-}
+} // namespace mlir
 
 #endif
diff --git a/include/triton/Dialect/TritonGPU/IR/Dialect.h b/include/triton/Dialect/TritonGPU/IR/Dialect.h
index dfa5ef864..9e8605ec8 100644
--- a/include/triton/Dialect/TritonGPU/IR/Dialect.h
+++ b/include/triton/Dialect/TritonGPU/IR/Dialect.h
@@ -15,5 +15,4 @@
 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonGPU/IR/Ops.h.inc"
 
-
 #endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_
diff --git a/include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h b/include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h
index fd9048570..6cb59c327 100644
--- a/include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h
+++ b/include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h
@@ -14,6 +14,7 @@ namespace mlir {
 class TritonGPUTypeConverter : public TypeConverter {
 public:
   TritonGPUTypeConverter(MLIRContext *context, int numThreads);
+
 private:
   MLIRContext *context;
   int numThreads;
@@ -21,8 +22,10 @@ private:
 
 class TritonGPUConversionTarget : public ConversionTarget {
   TritonGPUTypeConverter &typeConverter;
+
 public:
-  explicit TritonGPUConversionTarget(MLIRContext &ctx, TritonGPUTypeConverter &typeConverter);
+  explicit TritonGPUConversionTarget(MLIRContext &ctx,
+                                     TritonGPUTypeConverter &typeConverter);
 
   /// update layouts & insert ConvertLayoutOp if necessary
   LogicalResult refineLayouts(ModuleOp mod, int numThreads);
diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h
old mode 100755
new mode 100644
index 5503bacaf..85fc2cbc9
--- a/include/triton/driver/dispatch.h
+++ b/include/triton/driver/dispatch.h
@@ -3,10 +3,10 @@
 #ifndef _TRITON_DRIVER_DISPATCH_H_
 #define _TRITON_DRIVER_DISPATCH_H_
 
-#include <type_traits>
 #include <dlfcn.h>
+#include <type_traits>
 
-//CUDA Backend
+// CUDA Backend
 #include "triton/external/CUDA/cuda.h"
 #include "triton/external/CUDA/nvml.h"
 
@@ -14,47 +14,43 @@
 //#define __HIP_PLATFORM_AMD__
 #include "triton/external/hip.h"
 
-//Exceptions
+// Exceptions
 #include <iostream>
 #include <stdexcept>
 
 namespace llvm {
 class PassRegistry;
 class Module;
-}
+} // namespace llvm
 
-namespace triton
-{
-namespace driver
-{
+namespace triton {
+namespace driver {
 
 class cu_context;
 
-template<class T> void check(T){}
+template <class T> void check(T) {}
 void check(CUresult err);
 void check(hipError_t err);
 
-class dispatch
-{
+class dispatch {
 protected:
-  template <class F>
-  struct return_type;
+  template <class F> struct return_type;
 
-  template <class R, class... A>
-  struct return_type<R (*)(A...)>
-  { typedef R type; };
+  template <class R, class... A> struct return_type<R (*)(A...)> {
+    typedef R type;
+  };
 
   typedef bool (*f_init_t)();
 
-  template<f_init_t initializer, typename FunPtrT, typename... Args>
-  static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args)
-  {
+  template <f_init_t initializer, typename FunPtrT, typename... Args>
+  static typename return_type<FunPtrT>::type
+  f_impl(void *&lib_h, FunPtrT, void *&cache, const char *name, Args... args) {
     initializer();
-    if(cache == nullptr){
+    if (cache == nullptr) {
       cache = dlsym(lib_h, name);
-			if(cache == 0)
-				throw std::runtime_error("dlsym unable to load function");
-		}
+      if (cache == 0)
+        throw std::runtime_error("dlsym unable to load function");
+    }
     FunPtrT fptr;
     *reinterpret_cast<void **>(&fptr) = cache;
     typename return_type<FunPtrT>::type res = (*fptr)(args...);
@@ -76,63 +72,99 @@ public:
   // context management
   static CUresult cuInit(unsigned int Flags);
   static CUresult cuCtxDestroy_v2(CUcontext ctx);
-  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
+  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags,
+                                 CUdevice dev);
   static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
   static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
-  static CUresult cuCtxGetDevice(CUdevice* result);
-  static CUresult cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int flags);
+  static CUresult cuCtxGetDevice(CUdevice *result);
+  static CUresult cuCtxEnablePeerAccess(CUcontext peerContext,
+                                        unsigned int flags);
   static CUresult cuDriverGetVersion(int *driverVersion);
   // device management
   static CUresult cuDeviceGet(CUdevice *device, int ordinal);
   static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
   static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
-  static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+  static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
+                                       CUdevice dev);
   static CUresult cuDeviceGetCount(int *count);
   // link management
-  static CUresult cuLinkAddData_v2(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, unsigned int numOptions, CUjit_option* options, void** optionValues);
-  static CUresult cuLinkCreate_v2(unsigned int  numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut);
-  static CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut);
+  static CUresult cuLinkAddData_v2(CUlinkState state, CUjitInputType type,
+                                   void *data, size_t size, const char *name,
+                                   unsigned int numOptions,
+                                   CUjit_option *options, void **optionValues);
+  static CUresult cuLinkCreate_v2(unsigned int numOptions,
+                                  CUjit_option *options, void **optionValues,
+                                  CUlinkState *stateOut);
+  static CUresult cuLinkComplete(CUlinkState state, void **cubinOut,
+                                 size_t *sizeOut);
   static CUresult cuLinkDestroy(CUlinkState state);
   // module management
-  static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t* bytes, CUmodule hmod, const char *name);
+  static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t *bytes,
+                                       CUmodule hmod, const char *name);
   static CUresult cuModuleLoad(CUmodule *module, const char *fname);
-  static CUresult cuModuleLoadData(CUmodule* module, const void* image);
+  static CUresult cuModuleLoadData(CUmodule *module, const void *image);
   static CUresult cuModuleUnload(CUmodule hmod);
-  static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
-  static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
+  static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
+                                     unsigned int numOptions,
+                                     CUjit_option *options,
+                                     void **optionValues);
+  static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
+                                      const char *name);
   // stream management
   static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
   static CUresult cuStreamSynchronize(CUstream hStream);
-  static CUresult cuStreamGetCtx(CUstream hStream, CUcontext* pctx);
+  static CUresult cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
   static CUresult cuStreamDestroy_v2(CUstream hStream);
-  static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+  static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
+                                 unsigned int gridDimY, unsigned int gridDimZ,
+                                 unsigned int blockDimX, unsigned int blockDimY,
+                                 unsigned int blockDimZ,
+                                 unsigned int sharedMemBytes, CUstream hStream,
+                                 void **kernelParams, void **extra);
   // function management
-  static CUresult cuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfunc);
-  static CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value);
+  static CUresult cuFuncGetAttribute(int *pi, CUfunction_attribute attrib,
+                                     CUfunction hfunc);
+  static CUresult cuFuncSetAttribute(CUfunction hfunc,
+                                     CUfunction_attribute attrib, int value);
   static CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
   // memory management
   static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
-  static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
-  static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream);
-  static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+  static CUresult cuPointerGetAttribute(void *data,
+                                        CUpointer_attribute attribute,
+                                        CUdeviceptr ptr);
+  static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N,
+                                  CUstream stream);
+  static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice,
+                                  size_t ByteCount);
   static CUresult cuMemFree_v2(CUdeviceptr dptr);
-  static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-  static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
-  static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+  static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice,
+                                       size_t ByteCount, CUstream hStream);
+  static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice,
+                                       const void *srcHost, size_t ByteCount,
+                                       CUstream hStream);
+  static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost,
+                                  size_t ByteCount);
   // event management
   static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
-  static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+  static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart,
+                                     CUevent hEnd);
   static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
   static CUresult cuEventDestroy_v2(CUevent hEvent);
 
-
   /* ------------------- *
    * NVML
    * ------------------- */
-  static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
-  static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
-  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
-  static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock);
+  static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId,
+                                                       nvmlDevice_t *device);
+  static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device,
+                                             nvmlClockType_t type,
+                                             unsigned int *clock);
+  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device,
+                                                nvmlClockType_t type,
+                                                unsigned int *clock);
+  static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device,
+                                                      unsigned int mem_clock,
+                                                      unsigned int sm_clock);
 
   /* ------------------- *
    * HIP
@@ -140,177 +172,198 @@ public:
   // context management
   static hipError_t hipInit(unsigned int Flags);
   static hipError_t hipCtxDestroy(hipCtx_t ctx);
-  static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags, hipDevice_t dev);
+  static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags,
+                                 hipDevice_t dev);
   static hipError_t hipCtxPushCurrent(hipCtx_t ctx);
   static hipError_t hipCtxPopCurrent(hipCtx_t *pctx);
-  static hipError_t hipCtxGetDevice(hipDevice_t* result);
-  static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext, unsigned int flags);
+  static hipError_t hipCtxGetDevice(hipDevice_t *result);
+  static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext,
+                                           unsigned int flags);
   static hipError_t hipDriverGetVersion(int *driverVersion);
   // device management
   static hipError_t hipGetDevice(hipDevice_t *device, int ordinal);
   static hipError_t hipDeviceGetName(char *name, int len, hipDevice_t dev);
   static hipError_t hipDeviceGetPCIBusId(char *id, int len, hipDevice_t dev);
-  static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
+  static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib,
+                                          hipDevice_t dev);
   static hipError_t hipGetDeviceCount(int *count);
   // module management
-  static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t* bytes, hipModule_t hmod, const char *name);
+  static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
+                                       hipModule_t hmod, const char *name);
   static hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
-  static hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
+  static hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
   static hipError_t hipModuleUnload(hipModule_t hmod);
-  static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);
-  static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name);
+  static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image,
+                                        unsigned int numOptions,
+                                        hipJitOption *options,
+                                        void **optionValues);
+  static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
+                                         const char *name);
   // stream management
   static hipError_t hipStreamCreate(hipStream_t *phStream, unsigned int Flags);
   static hipError_t hipStreamSynchronize(hipStream_t hStream);
   static hipError_t hipStreamDestroy(hipStream_t hStream);
-  static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra);
+  static hipError_t
+  hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
+                        unsigned int gridDimY, unsigned int gridDimZ,
+                        unsigned int blockDimX, unsigned int blockDimY,
+                        unsigned int blockDimZ, unsigned int sharedMemBytes,
+                        hipStream_t hStream, void **kernelParams, void **extra);
   // function management
-  static hipError_t hipFuncGetAttributes(hipFuncAttributes* attrib, void* hfunc);
-  static hipError_t hipFuncSetAttribute(hipFunction_t hfunc, hipFuncAttribute attrib, int value);
-  static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc, hipFuncCache_t config);
+  static hipError_t hipFuncGetAttributes(hipFuncAttributes *attrib,
+                                         void *hfunc);
+  static hipError_t hipFuncSetAttribute(hipFunction_t hfunc,
+                                        hipFuncAttribute attrib, int value);
+  static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc,
+                                          hipFuncCache_t config);
   // memory management
   static hipError_t hipMalloc(hipDeviceptr_t *dptr, size_t bytesize);
-  static hipError_t hipPointerGetAttribute(void * data, CUpointer_attribute attribute, hipDeviceptr_t ptr);
-  static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x, size_t N, hipStream_t stream);
-  static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount);
+  static hipError_t hipPointerGetAttribute(void *data,
+                                           CUpointer_attribute attribute,
+                                           hipDeviceptr_t ptr);
+  static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x,
+                                     size_t N, hipStream_t stream);
+  static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice,
+                                  size_t ByteCount);
   static hipError_t hipFree(hipDeviceptr_t dptr);
-  static hipError_t hipMemcpyDtoHAsync(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
-  static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream);
-  static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount);
+  static hipError_t hipMemcpyDtoHAsync(void *dstHost, hipDeviceptr_t srcDevice,
+                                       size_t ByteCount, hipStream_t hStream);
+  static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice,
+                                       const void *srcHost, size_t ByteCount,
+                                       hipStream_t hStream);
+  static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost,
+                                  size_t ByteCount);
   // event management
   static hipError_t hipEventCreate(hipEvent_t *phEvent, unsigned int Flags);
-  static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd);
+  static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart,
+                                        hipEvent_t hEnd);
   static hipError_t hipEventRecord(hipEvent_t hEvent, hipStream_t hStream);
   static hipError_t hipEventDestroy(hipEvent_t hEvent);
 
-
-
 private:
-
   // Libraries
-  static void* cuda_;
-  static void* nvml_;
-  static void* hip_;
-
+  static void *cuda_;
+  static void *nvml_;
+  static void *hip_;
 
   /* ------------------- *
    * CUDA
    * ------------------- */
   // context management
-  static void* cuCtxGetCurrent_;
-  static void* cuCtxSetCurrent_;
-  static void* cuCtxDestroy_v2_;
-  static void* cuCtxCreate_v2_;
-  static void* cuCtxGetDevice_;
-  static void* cuCtxPushCurrent_v2_;
-  static void* cuCtxPopCurrent_v2_;
-  static void* cuCtxEnablePeerAccess_;
-  static void* cuDriverGetVersion_;
-  static void* cuInit_;
+  static void *cuCtxGetCurrent_;
+  static void *cuCtxSetCurrent_;
+  static void *cuCtxDestroy_v2_;
+  static void *cuCtxCreate_v2_;
+  static void *cuCtxGetDevice_;
+  static void *cuCtxPushCurrent_v2_;
+  static void *cuCtxPopCurrent_v2_;
+  static void *cuCtxEnablePeerAccess_;
+  static void *cuDriverGetVersion_;
+  static void *cuInit_;
   // device management
-  static void* cuDeviceGet_;
-  static void* cuDeviceGetName_;
-  static void* cuDeviceGetPCIBusId_;
-  static void* cuDeviceGetAttribute_;
-  static void* cuDeviceGetCount_;
+  static void *cuDeviceGet_;
+  static void *cuDeviceGetName_;
+  static void *cuDeviceGetPCIBusId_;
+  static void *cuDeviceGetAttribute_;
+  static void *cuDeviceGetCount_;
   // link management
-  static void* cuLinkAddData_v2_;
-  static void* cuLinkCreate_v2_;
-  static void* cuLinkDestroy_;
-  static void* cuLinkComplete_;
+  static void *cuLinkAddData_v2_;
+  static void *cuLinkCreate_v2_;
+  static void *cuLinkDestroy_;
+  static void *cuLinkComplete_;
   // module management
-  static void* cuModuleGetGlobal_v2_;
-  static void* cuModuleLoad_;
-  static void* cuModuleUnload_;
-  static void* cuModuleLoadDataEx_;
-  static void* cuModuleLoadData_;
-  static void* cuModuleGetFunction_;
+  static void *cuModuleGetGlobal_v2_;
+  static void *cuModuleLoad_;
+  static void *cuModuleUnload_;
+  static void *cuModuleLoadDataEx_;
+  static void *cuModuleLoadData_;
+  static void *cuModuleGetFunction_;
   // stream management
-  static void* cuStreamCreate_;
-  static void* cuStreamSynchronize_;
-  static void* cuStreamDestroy_v2_;
-  static void* cuStreamGetCtx_;
-  static void* cuLaunchKernel_;
+  static void *cuStreamCreate_;
+  static void *cuStreamSynchronize_;
+  static void *cuStreamDestroy_v2_;
+  static void *cuStreamGetCtx_;
+  static void *cuLaunchKernel_;
   // function management
-  static void* cuFuncGetAttribute_;
-  static void* cuFuncSetAttribute_;
-  static void* cuFuncSetCacheConfig_;
+  static void *cuFuncGetAttribute_;
+  static void *cuFuncSetAttribute_;
+  static void *cuFuncSetCacheConfig_;
   // memory management
-  static void* cuMemcpyDtoH_v2_;
-  static void* cuMemFree_v2_;
-  static void* cuMemcpyDtoHAsync_v2_;
-  static void* cuMemcpyHtoDAsync_v2_;
-  static void* cuMemcpyHtoD_v2_;
-  static void* cuMemAlloc_v2_;
-  static void* cuMemsetD8Async_;
-  static void* cuPointerGetAttribute_;
+  static void *cuMemcpyDtoH_v2_;
+  static void *cuMemFree_v2_;
+  static void *cuMemcpyDtoHAsync_v2_;
+  static void *cuMemcpyHtoDAsync_v2_;
+  static void *cuMemcpyHtoD_v2_;
+  static void *cuMemAlloc_v2_;
+  static void *cuMemsetD8Async_;
+  static void *cuPointerGetAttribute_;
   // event management
-  static void* cuEventCreate_;
-  static void* cuEventElapsedTime_;
-  static void* cuEventRecord_;
-  static void* cuEventDestroy_v2_;
+  static void *cuEventCreate_;
+  static void *cuEventElapsedTime_;
+  static void *cuEventRecord_;
+  static void *cuEventDestroy_v2_;
 
   /* ------------------- *
    * NVML
    * ------------------- */
-  static void* nvmlInit_v2_;
-  static void* nvmlDeviceGetHandleByPciBusId_v2_;
-  static void* nvmlDeviceGetClockInfo_;
-  static void* nvmlDeviceGetMaxClockInfo_;
-  static void* nvmlDeviceSetApplicationsClocks_;
+  static void *nvmlInit_v2_;
+  static void *nvmlDeviceGetHandleByPciBusId_v2_;
+  static void *nvmlDeviceGetClockInfo_;
+  static void *nvmlDeviceGetMaxClockInfo_;
+  static void *nvmlDeviceSetApplicationsClocks_;
 
   /* ------------------- *
    * HIP
    * ------------------- */
   // context management
-  static void* hipInit_;
-  static void* hipCtxDestroy_;
-  static void* hipCtxCreate_;
-  static void* hipCtxPushCurrent_;
-  static void* hipCtxPopCurrent_;
-  static void* hipCtxGetDevice_;
-  static void* hipCtxEnablePeerAccess_;
-  static void* hipDriverGetVersion_;
+  static void *hipInit_;
+  static void *hipCtxDestroy_;
+  static void *hipCtxCreate_;
+  static void *hipCtxPushCurrent_;
+  static void *hipCtxPopCurrent_;
+  static void *hipCtxGetDevice_;
+  static void *hipCtxEnablePeerAccess_;
+  static void *hipDriverGetVersion_;
   // device management
-  static void* hipGetDevice_;
-  static void* hipDeviceGetName_;
-  static void* hipDeviceGetPCIBusId_;
-  static void* hipDeviceGetAttribute_;
-  static void* hipGetDeviceCount_;
+  static void *hipGetDevice_;
+  static void *hipDeviceGetName_;
+  static void *hipDeviceGetPCIBusId_;
+  static void *hipDeviceGetAttribute_;
+  static void *hipGetDeviceCount_;
   // module management
-  static void* hipModuleGetGlobal_;
-  static void* hipModuleLoad_;
-  static void* hipModuleLoadData_;
-  static void* hipModuleUnload_;
-  static void* hipModuleLoadDataEx_;
-  static void* hipModuleGetFunction_;
+  static void *hipModuleGetGlobal_;
+  static void *hipModuleLoad_;
+  static void *hipModuleLoadData_;
+  static void *hipModuleUnload_;
+  static void *hipModuleLoadDataEx_;
+  static void *hipModuleGetFunction_;
   // stream management
-  static void* hipStreamCreate_;
-  static void* hipStreamSynchronize_;
-  static void* hipStreamDestroy_;
-  static void* hipModuleLaunchKernel_;;
+  static void *hipStreamCreate_;
+  static void *hipStreamSynchronize_;
+  static void *hipStreamDestroy_;
+  static void *hipModuleLaunchKernel_;
+  ;
   // function management
-  static void* hipFuncGetAttributes_;
-  static void* hipFuncSetAttribute_;
-  static void* hipFuncSetCacheConfig_;
+  static void *hipFuncGetAttributes_;
+  static void *hipFuncSetAttribute_;
+  static void *hipFuncSetCacheConfig_;
   // memory management
-  static void* hipMalloc_;
-  static void* hipPointerGetAttribute_;
-  static void* hipMemsetD8Async_;
-  static void* hipMemcpyDtoH_;
-  static void* hipFree_;
-  static void* hipMemcpyDtoHAsync_;
-  static void* hipMemcpyHtoDAsync_;
-  static void* hipMemcpyHtoD_;
+  static void *hipMalloc_;
+  static void *hipPointerGetAttribute_;
+  static void *hipMemsetD8Async_;
+  static void *hipMemcpyDtoH_;
+  static void *hipFree_;
+  static void *hipMemcpyDtoHAsync_;
+  static void *hipMemcpyHtoDAsync_;
+  static void *hipMemcpyHtoD_;
   // event management
-  static void* hipEventCreate_;
-  static void* hipEventElapsedTime_;
-  static void* hipEventRecord_;
-  static void* hipEventDestroy_;
+  static void *hipEventCreate_;
+  static void *hipEventElapsedTime_;
+  static void *hipEventRecord_;
+  static void *hipEventDestroy_;
 };
 
-}
-}
-
+} // namespace driver
+} // namespace triton
 
 #endif
diff --git a/include/triton/driver/error.h b/include/triton/driver/error.h
old mode 100755
new mode 100644
index c3168c1ad..229e1dee4
--- a/include/triton/driver/error.h
+++ b/include/triton/driver/error.h
@@ -3,223 +3,252 @@
 #ifndef _TRITON_DRIVER_ERROR_H_
 #define _TRITON_DRIVER_ERROR_H_
 
-#include <exception>
 #include "triton/driver/dispatch.h"
+#include <exception>
 
+namespace triton {
 
-namespace triton
-{
+namespace driver {
 
-  namespace driver
-  {
+namespace exception {
 
-  namespace exception
-  {
+namespace nvrtc {
 
-  namespace nvrtc
-  {
+#define TRITON_CREATE_NVRTC_EXCEPTION(name, msg)                               \
+  class name : public std::exception {                                         \
+  public:                                                                      \
+    const char *what() const throw() override { return "NVRTC: Error- " msg; } \
+  }
 
-#define TRITON_CREATE_NVRTC_EXCEPTION(name, msg) \
-class name: public std::exception { public: const char * what() const throw() override { return "NVRTC: Error- " msg; } }
-
-  TRITON_CREATE_NVRTC_EXCEPTION(out_of_memory              ,"out of memory");
-  TRITON_CREATE_NVRTC_EXCEPTION(program_creation_failure   ,"program creation failure");
-  TRITON_CREATE_NVRTC_EXCEPTION(invalid_input              ,"invalid input");
-  TRITON_CREATE_NVRTC_EXCEPTION(invalid_program            ,"invalid program");
-  TRITON_CREATE_NVRTC_EXCEPTION(invalid_option             ,"invalid option");
-  TRITON_CREATE_NVRTC_EXCEPTION(compilation                ,"compilation");
-  TRITON_CREATE_NVRTC_EXCEPTION(builtin_operation_failure  ,"builtin operation failure");
-  TRITON_CREATE_NVRTC_EXCEPTION(unknown_error              ,"unknown error");
+TRITON_CREATE_NVRTC_EXCEPTION(out_of_memory, "out of memory");
+TRITON_CREATE_NVRTC_EXCEPTION(program_creation_failure,
+                              "program creation failure");
+TRITON_CREATE_NVRTC_EXCEPTION(invalid_input, "invalid input");
+TRITON_CREATE_NVRTC_EXCEPTION(invalid_program, "invalid program");
+TRITON_CREATE_NVRTC_EXCEPTION(invalid_option, "invalid option");
+TRITON_CREATE_NVRTC_EXCEPTION(compilation, "compilation");
+TRITON_CREATE_NVRTC_EXCEPTION(builtin_operation_failure,
+                              "builtin operation failure");
+TRITON_CREATE_NVRTC_EXCEPTION(unknown_error, "unknown error");
 
 #undef TRITON_CREATE_NVRTC_EXCEPTION
+} // namespace nvrtc
+
+namespace cuda {
+class base : public std::exception {};
+
+#define TRITON_CREATE_CUDA_EXCEPTION(name, msg)                                \
+  class name : public base {                                                   \
+  public:                                                                      \
+    const char *what() const throw() override { return "CUDA: Error- " msg; }  \
   }
 
-
-  namespace cuda
-  {
-  class base: public std::exception{};
-
-#define TRITON_CREATE_CUDA_EXCEPTION(name, msg) \
-class name: public base { public:const char * what() const throw() override { return "CUDA: Error- " msg; } }
-
-
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_value                   ,"invalid value");
-  TRITON_CREATE_CUDA_EXCEPTION(out_of_memory                   ,"out of memory");
-  TRITON_CREATE_CUDA_EXCEPTION(not_initialized                 ,"not initialized");
-  TRITON_CREATE_CUDA_EXCEPTION(deinitialized                   ,"deinitialized");
-  TRITON_CREATE_CUDA_EXCEPTION(profiler_disabled               ,"profiler disabled");
-  TRITON_CREATE_CUDA_EXCEPTION(profiler_not_initialized        ,"profiler not initialized");
-  TRITON_CREATE_CUDA_EXCEPTION(profiler_already_started        ,"profiler already started");
-  TRITON_CREATE_CUDA_EXCEPTION(profiler_already_stopped        ,"profiler already stopped");
-  TRITON_CREATE_CUDA_EXCEPTION(no_device                       ,"no device");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_device                  ,"invalid device");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_image                   ,"invalid image");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_context                 ,"invalid context");
-  TRITON_CREATE_CUDA_EXCEPTION(context_already_current         ,"context already current");
-  TRITON_CREATE_CUDA_EXCEPTION(map_failed                      ,"map failed");
-  TRITON_CREATE_CUDA_EXCEPTION(unmap_failed                    ,"unmap failed");
-  TRITON_CREATE_CUDA_EXCEPTION(array_is_mapped                 ,"array is mapped");
-  TRITON_CREATE_CUDA_EXCEPTION(already_mapped                  ,"already mapped");
-  TRITON_CREATE_CUDA_EXCEPTION(no_binary_for_gpu               ,"no binary for gpu");
-  TRITON_CREATE_CUDA_EXCEPTION(already_acquired                ,"already acquired");
-  TRITON_CREATE_CUDA_EXCEPTION(not_mapped                      ,"not mapped");
-  TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_array             ,"not mapped as array");
-  TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer           ,"not mapped as pointer");
-  TRITON_CREATE_CUDA_EXCEPTION(ecc_uncorrectable               ,"ecc uncorrectable");
-  TRITON_CREATE_CUDA_EXCEPTION(unsupported_limit               ,"unsupported limit");
-  TRITON_CREATE_CUDA_EXCEPTION(context_already_in_use          ,"context already in use");
-  TRITON_CREATE_CUDA_EXCEPTION(peer_access_unsupported         ,"peer access unsupported");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_ptx                     ,"invalid ptx");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_graphics_context        ,"invalid graphics context");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_source                  ,"invalid source");
-  TRITON_CREATE_CUDA_EXCEPTION(file_not_found                  ,"file not found");
-  TRITON_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found  ,"shared object symbol not found");
-  TRITON_CREATE_CUDA_EXCEPTION(shared_object_init_failed       ,"shared object init failed");
-  TRITON_CREATE_CUDA_EXCEPTION(operating_system                ,"operating system");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_handle                  ,"invalid handle");
-  TRITON_CREATE_CUDA_EXCEPTION(not_found                       ,"not found");
-  TRITON_CREATE_CUDA_EXCEPTION(not_ready                       ,"not ready");
-  TRITON_CREATE_CUDA_EXCEPTION(illegal_address                 ,"illegal address");
-  TRITON_CREATE_CUDA_EXCEPTION(launch_out_of_resources         ,"launch out of resources");
-  TRITON_CREATE_CUDA_EXCEPTION(launch_timeout                  ,"launch timeout");
-  TRITON_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing   ,"launch incompatible texturing");
-  TRITON_CREATE_CUDA_EXCEPTION(peer_access_already_enabled     ,"peer access already enabled");
-  TRITON_CREATE_CUDA_EXCEPTION(peer_access_not_enabled         ,"peer access not enabled");
-  TRITON_CREATE_CUDA_EXCEPTION(primary_context_active          ,"primary context active");
-  TRITON_CREATE_CUDA_EXCEPTION(context_is_destroyed            ,"context is destroyed");
-  TRITON_CREATE_CUDA_EXCEPTION(assert_error                    ,"assert");
-  TRITON_CREATE_CUDA_EXCEPTION(too_many_peers                  ,"too many peers");
-  TRITON_CREATE_CUDA_EXCEPTION(host_memory_already_registered  ,"host memory already registered");
-  TRITON_CREATE_CUDA_EXCEPTION(host_memory_not_registered      ,"hot memory not registered");
-  TRITON_CREATE_CUDA_EXCEPTION(hardware_stack_error            ,"hardware stack error");
-  TRITON_CREATE_CUDA_EXCEPTION(illegal_instruction             ,"illegal instruction");
-  TRITON_CREATE_CUDA_EXCEPTION(misaligned_address              ,"misaligned address");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_address_space           ,"invalid address space");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_pc                      ,"invalid pc");
-  TRITON_CREATE_CUDA_EXCEPTION(launch_failed                   ,"launch failed");
-  TRITON_CREATE_CUDA_EXCEPTION(not_permitted                   ,"not permitted");
-  TRITON_CREATE_CUDA_EXCEPTION(not_supported                   ,"not supported");
-  TRITON_CREATE_CUDA_EXCEPTION(unknown                         ,"unknown");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_value, "invalid value");
+TRITON_CREATE_CUDA_EXCEPTION(out_of_memory, "out of memory");
+TRITON_CREATE_CUDA_EXCEPTION(not_initialized, "not initialized");
+TRITON_CREATE_CUDA_EXCEPTION(deinitialized, "deinitialized");
+TRITON_CREATE_CUDA_EXCEPTION(profiler_disabled, "profiler disabled");
+TRITON_CREATE_CUDA_EXCEPTION(profiler_not_initialized,
+                             "profiler not initialized");
+TRITON_CREATE_CUDA_EXCEPTION(profiler_already_started,
+                             "profiler already started");
+TRITON_CREATE_CUDA_EXCEPTION(profiler_already_stopped,
+                             "profiler already stopped");
+TRITON_CREATE_CUDA_EXCEPTION(no_device, "no device");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_device, "invalid device");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_image, "invalid image");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_context, "invalid context");
+TRITON_CREATE_CUDA_EXCEPTION(context_already_current,
+                             "context already current");
+TRITON_CREATE_CUDA_EXCEPTION(map_failed, "map failed");
+TRITON_CREATE_CUDA_EXCEPTION(unmap_failed, "unmap failed");
+TRITON_CREATE_CUDA_EXCEPTION(array_is_mapped, "array is mapped");
+TRITON_CREATE_CUDA_EXCEPTION(already_mapped, "already mapped");
+TRITON_CREATE_CUDA_EXCEPTION(no_binary_for_gpu, "no binary for gpu");
+TRITON_CREATE_CUDA_EXCEPTION(already_acquired, "already acquired");
+TRITON_CREATE_CUDA_EXCEPTION(not_mapped, "not mapped");
+TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_array, "not mapped as array");
+TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer, "not mapped as pointer");
+TRITON_CREATE_CUDA_EXCEPTION(ecc_uncorrectable, "ecc uncorrectable");
+TRITON_CREATE_CUDA_EXCEPTION(unsupported_limit, "unsupported limit");
+TRITON_CREATE_CUDA_EXCEPTION(context_already_in_use, "context already in use");
+TRITON_CREATE_CUDA_EXCEPTION(peer_access_unsupported,
+                             "peer access unsupported");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_ptx, "invalid ptx");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_graphics_context,
+                             "invalid graphics context");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_source, "invalid source");
+TRITON_CREATE_CUDA_EXCEPTION(file_not_found, "file not found");
+TRITON_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found,
+                             "shared object symbol not found");
+TRITON_CREATE_CUDA_EXCEPTION(shared_object_init_failed,
+                             "shared object init failed");
+TRITON_CREATE_CUDA_EXCEPTION(operating_system, "operating system");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_handle, "invalid handle");
+TRITON_CREATE_CUDA_EXCEPTION(not_found, "not found");
+TRITON_CREATE_CUDA_EXCEPTION(not_ready, "not ready");
+TRITON_CREATE_CUDA_EXCEPTION(illegal_address, "illegal address");
+TRITON_CREATE_CUDA_EXCEPTION(launch_out_of_resources,
+                             "launch out of resources");
+TRITON_CREATE_CUDA_EXCEPTION(launch_timeout, "launch timeout");
+TRITON_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing,
+                             "launch incompatible texturing");
+TRITON_CREATE_CUDA_EXCEPTION(peer_access_already_enabled,
+                             "peer access already enabled");
+TRITON_CREATE_CUDA_EXCEPTION(peer_access_not_enabled,
+                             "peer access not enabled");
+TRITON_CREATE_CUDA_EXCEPTION(primary_context_active, "primary context active");
+TRITON_CREATE_CUDA_EXCEPTION(context_is_destroyed, "context is destroyed");
+TRITON_CREATE_CUDA_EXCEPTION(assert_error, "assert");
+TRITON_CREATE_CUDA_EXCEPTION(too_many_peers, "too many peers");
+TRITON_CREATE_CUDA_EXCEPTION(host_memory_already_registered,
+                             "host memory already registered");
+TRITON_CREATE_CUDA_EXCEPTION(host_memory_not_registered,
+                             "host memory not registered");
+TRITON_CREATE_CUDA_EXCEPTION(hardware_stack_error, "hardware stack error");
+TRITON_CREATE_CUDA_EXCEPTION(illegal_instruction, "illegal instruction");
+TRITON_CREATE_CUDA_EXCEPTION(misaligned_address, "misaligned address");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_address_space, "invalid address space");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_pc, "invalid pc");
+TRITON_CREATE_CUDA_EXCEPTION(launch_failed, "launch failed");
+TRITON_CREATE_CUDA_EXCEPTION(not_permitted, "not permitted");
+TRITON_CREATE_CUDA_EXCEPTION(not_supported, "not supported");
+TRITON_CREATE_CUDA_EXCEPTION(unknown, "unknown");
 
 #undef TRITON_CREATE_CUDA_EXCEPTION
+} // namespace cuda
+
+namespace cublas {
+class base : public std::exception {};
+
+#define TRITON_CREATE_CUBLAS_EXCEPTION(name, msg)                              \
+  class name : public base {                                                   \
+  public:                                                                      \
+    const char *what() const throw() override {                                \
+      return "CUBLAS: Error- " msg;                                            \
+    }                                                                          \
   }
 
-  namespace cublas
-  {
-  class base: public std::exception{};
-
-#define TRITON_CREATE_CUBLAS_EXCEPTION(name, msg) \
-class name: public base { public: const char * what() const throw() override { return "CUBLAS: Error- " msg; } }
-
-  TRITON_CREATE_CUBLAS_EXCEPTION(not_initialized              ,"not initialized");
-  TRITON_CREATE_CUBLAS_EXCEPTION(alloc_failed                 ,"alloc failed");
-  TRITON_CREATE_CUBLAS_EXCEPTION(invalid_value                ,"invalid value");
-  TRITON_CREATE_CUBLAS_EXCEPTION(arch_mismatch                ,"arch mismatch");
-  TRITON_CREATE_CUBLAS_EXCEPTION(mapping_error                ,"mapping error");
-  TRITON_CREATE_CUBLAS_EXCEPTION(execution_failed             ,"execution failed");
-  TRITON_CREATE_CUBLAS_EXCEPTION(internal_error               ,"internal error");
-  TRITON_CREATE_CUBLAS_EXCEPTION(not_supported                ,"not supported");
-  TRITON_CREATE_CUBLAS_EXCEPTION(license_error                ,"license error");
-  TRITON_CREATE_CUBLAS_EXCEPTION(unknown                      ,"unknown");
+TRITON_CREATE_CUBLAS_EXCEPTION(not_initialized, "not initialized");
+TRITON_CREATE_CUBLAS_EXCEPTION(alloc_failed, "alloc failed");
+TRITON_CREATE_CUBLAS_EXCEPTION(invalid_value, "invalid value");
+TRITON_CREATE_CUBLAS_EXCEPTION(arch_mismatch, "arch mismatch");
+TRITON_CREATE_CUBLAS_EXCEPTION(mapping_error, "mapping error");
+TRITON_CREATE_CUBLAS_EXCEPTION(execution_failed, "execution failed");
+TRITON_CREATE_CUBLAS_EXCEPTION(internal_error, "internal error");
+TRITON_CREATE_CUBLAS_EXCEPTION(not_supported, "not supported");
+TRITON_CREATE_CUBLAS_EXCEPTION(license_error, "license error");
+TRITON_CREATE_CUBLAS_EXCEPTION(unknown, "unknown");
 
 #undef TRITON_CREATE_CUBLAS_EXCEPTION
+} // namespace cublas
+
+namespace cudnn {
+#define TRITON_CREATE_CUDNN_EXCEPTION(name, msg)                               \
+  class name : public std::exception {                                         \
+  public:                                                                      \
+    const char *what() const throw() override { return "CUDNN: Error- " msg; } \
   }
 
-  namespace cudnn
-  {
-#define TRITON_CREATE_CUDNN_EXCEPTION(name, msg) \
-class name: public std::exception { public: const char * what() const throw() override { return "CUDNN: Error- " msg; } }
+TRITON_CREATE_CUDNN_EXCEPTION(not_initialized, "not initialized");
+TRITON_CREATE_CUDNN_EXCEPTION(alloc_failed, "allocation failed");
+TRITON_CREATE_CUDNN_EXCEPTION(bad_param, "bad param");
+TRITON_CREATE_CUDNN_EXCEPTION(internal_error, "internal error");
+TRITON_CREATE_CUDNN_EXCEPTION(invalid_value, "invalid value");
+TRITON_CREATE_CUDNN_EXCEPTION(arch_mismatch, "arch mismatch");
+TRITON_CREATE_CUDNN_EXCEPTION(mapping_error, "mapping error");
+TRITON_CREATE_CUDNN_EXCEPTION(execution_failed, "execution failed");
+TRITON_CREATE_CUDNN_EXCEPTION(not_supported, "not supported");
+TRITON_CREATE_CUDNN_EXCEPTION(license_error, "license error");
+TRITON_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing,
+                              "prerequisite missing");
+TRITON_CREATE_CUDNN_EXCEPTION(runtime_in_progress, "runtime in progress");
+TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow, "runtime fp overflow");
+} // namespace cudnn
 
-  TRITON_CREATE_CUDNN_EXCEPTION(not_initialized              ,"not initialized");
-  TRITON_CREATE_CUDNN_EXCEPTION(alloc_failed                 ,"allocation failed");
-  TRITON_CREATE_CUDNN_EXCEPTION(bad_param                    ,"bad param");
-  TRITON_CREATE_CUDNN_EXCEPTION(internal_error               ,"internal error");
-  TRITON_CREATE_CUDNN_EXCEPTION(invalid_value                ,"invalid value");
-  TRITON_CREATE_CUDNN_EXCEPTION(arch_mismatch                ,"arch mismatch");
-  TRITON_CREATE_CUDNN_EXCEPTION(mapping_error                ,"mapping error");
-  TRITON_CREATE_CUDNN_EXCEPTION(execution_failed             ,"execution failed");
-  TRITON_CREATE_CUDNN_EXCEPTION(not_supported                ,"not supported");
-  TRITON_CREATE_CUDNN_EXCEPTION(license_error                ,"license error");
-  TRITON_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing ,"prerequisite missing");
-  TRITON_CREATE_CUDNN_EXCEPTION(runtime_in_progress          ,"runtime in progress");
-  TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow          ,"runtime fp overflow");
+namespace hip {
+class base : public std::exception {};
+
+#define TRITON_CREATE_HIP_EXCEPTION(name, msg)                                 \
+  class name : public base {                                                   \
+  public:                                                                      \
+    const char *what() const throw() override { return "HIP: Error- " msg; }   \
   }
 
-
-
-
-  namespace hip
-  {
-  class base: public std::exception{};
-
-#define TRITON_CREATE_HIP_EXCEPTION(name, msg) \
-class name: public base { public:const char * what() const throw() override { return "HIP: Error- " msg; } }
-
-
-  TRITON_CREATE_HIP_EXCEPTION(invalid_value                   ,"invalid value");
-  TRITON_CREATE_HIP_EXCEPTION(out_of_memory                   ,"out of memory");
-  TRITON_CREATE_HIP_EXCEPTION(not_initialized                 ,"not initialized");
-  TRITON_CREATE_HIP_EXCEPTION(deinitialized                   ,"deinitialized");
-  TRITON_CREATE_HIP_EXCEPTION(profiler_disabled               ,"profiler disabled");
-  TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized        ,"profiler not initialized");
-  TRITON_CREATE_HIP_EXCEPTION(profiler_already_started        ,"profiler already started");
-  TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped        ,"profiler already stopped");
-  TRITON_CREATE_HIP_EXCEPTION(no_device                       ,"no device");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_device                  ,"invalid device");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_image                   ,"invalid image");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_context                 ,"invalid context");
-  TRITON_CREATE_HIP_EXCEPTION(context_already_current         ,"context already current");
-  TRITON_CREATE_HIP_EXCEPTION(map_failed                      ,"map failed");
-  TRITON_CREATE_HIP_EXCEPTION(unmap_failed                    ,"unmap failed");
-  TRITON_CREATE_HIP_EXCEPTION(array_is_mapped                 ,"array is mapped");
-  TRITON_CREATE_HIP_EXCEPTION(already_mapped                  ,"already mapped");
-  TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu               ,"no binary for gpu");
-  TRITON_CREATE_HIP_EXCEPTION(already_acquired                ,"already acquired");
-  TRITON_CREATE_HIP_EXCEPTION(not_mapped                      ,"not mapped");
-  TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array             ,"not mapped as array");
-  TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer           ,"not mapped as pointer");
-  TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable               ,"ecc uncorrectable");
-  TRITON_CREATE_HIP_EXCEPTION(unsupported_limit               ,"unsupported limit");
-  TRITON_CREATE_HIP_EXCEPTION(context_already_in_use          ,"context already in use");
-  TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported         ,"peer access unsupported");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_ptx                     ,"invalid ptx");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context        ,"invalid graphics context");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_source                  ,"invalid source");
-  TRITON_CREATE_HIP_EXCEPTION(file_not_found                  ,"file not found");
-  TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found  ,"shared object symbol not found");
-  TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed       ,"shared object init failed");
-  TRITON_CREATE_HIP_EXCEPTION(operating_system                ,"operating system");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_handle                  ,"invalid handle");
-  TRITON_CREATE_HIP_EXCEPTION(not_found                       ,"not found");
-  TRITON_CREATE_HIP_EXCEPTION(not_ready                       ,"not ready");
-  TRITON_CREATE_HIP_EXCEPTION(illegal_address                 ,"illegal address");
-  TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources         ,"launch out of resources");
-  TRITON_CREATE_HIP_EXCEPTION(launch_timeout                  ,"launch timeout");
-  TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing   ,"launch incompatible texturing");
-  TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled     ,"peer access already enabled");
-  TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled         ,"peer access not enabled");
-  TRITON_CREATE_HIP_EXCEPTION(primary_context_active          ,"primary context active");
-  TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed            ,"context is destroyed");
-  TRITON_CREATE_HIP_EXCEPTION(assert_error                    ,"assert");
-  TRITON_CREATE_HIP_EXCEPTION(too_many_peers                  ,"too many peers");
-  TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered  ,"host memory already registered");
-  TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered      ,"hot memory not registered");
-  TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error            ,"hardware stack error");
-  TRITON_CREATE_HIP_EXCEPTION(illegal_instruction             ,"illegal instruction");
-  TRITON_CREATE_HIP_EXCEPTION(misaligned_address              ,"misaligned address");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_address_space           ,"invalid address space");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_pc                      ,"invalid pc");
-  TRITON_CREATE_HIP_EXCEPTION(launch_failed                   ,"launch failed");
-  TRITON_CREATE_HIP_EXCEPTION(not_permitted                   ,"not permitted");
-  TRITON_CREATE_HIP_EXCEPTION(not_supported                   ,"not supported");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_symbol                   ,"invalid symbol");
-  TRITON_CREATE_HIP_EXCEPTION(unknown                         ,"unknown");
+TRITON_CREATE_HIP_EXCEPTION(invalid_value, "invalid value");
+TRITON_CREATE_HIP_EXCEPTION(out_of_memory, "out of memory");
+TRITON_CREATE_HIP_EXCEPTION(not_initialized, "not initialized");
+TRITON_CREATE_HIP_EXCEPTION(deinitialized, "deinitialized");
+TRITON_CREATE_HIP_EXCEPTION(profiler_disabled, "profiler disabled");
+TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized,
+                            "profiler not initialized");
+TRITON_CREATE_HIP_EXCEPTION(profiler_already_started,
+                            "profiler already started");
+TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped,
+                            "profiler already stopped");
+TRITON_CREATE_HIP_EXCEPTION(no_device, "no device");
+TRITON_CREATE_HIP_EXCEPTION(invalid_device, "invalid device");
+TRITON_CREATE_HIP_EXCEPTION(invalid_image, "invalid image");
+TRITON_CREATE_HIP_EXCEPTION(invalid_context, "invalid context");
+TRITON_CREATE_HIP_EXCEPTION(context_already_current, "context already current");
+TRITON_CREATE_HIP_EXCEPTION(map_failed, "map failed");
+TRITON_CREATE_HIP_EXCEPTION(unmap_failed, "unmap failed");
+TRITON_CREATE_HIP_EXCEPTION(array_is_mapped, "array is mapped");
+TRITON_CREATE_HIP_EXCEPTION(already_mapped, "already mapped");
+TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu, "no binary for gpu");
+TRITON_CREATE_HIP_EXCEPTION(already_acquired, "already acquired");
+TRITON_CREATE_HIP_EXCEPTION(not_mapped, "not mapped");
+TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array, "not mapped as array");
+TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer, "not mapped as pointer");
+TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable, "ecc uncorrectable");
+TRITON_CREATE_HIP_EXCEPTION(unsupported_limit, "unsupported limit");
+TRITON_CREATE_HIP_EXCEPTION(context_already_in_use, "context already in use");
+TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported, "peer access unsupported");
+TRITON_CREATE_HIP_EXCEPTION(invalid_ptx, "invalid ptx");
+TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context,
+                            "invalid graphics context");
+TRITON_CREATE_HIP_EXCEPTION(invalid_source, "invalid source");
+TRITON_CREATE_HIP_EXCEPTION(file_not_found, "file not found");
+TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found,
+                            "shared object symbol not found");
+TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed,
+                            "shared object init failed");
+TRITON_CREATE_HIP_EXCEPTION(operating_system, "operating system");
+TRITON_CREATE_HIP_EXCEPTION(invalid_handle, "invalid handle");
+TRITON_CREATE_HIP_EXCEPTION(not_found, "not found");
+TRITON_CREATE_HIP_EXCEPTION(not_ready, "not ready");
+TRITON_CREATE_HIP_EXCEPTION(illegal_address, "illegal address");
+TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources, "launch out of resources");
+TRITON_CREATE_HIP_EXCEPTION(launch_timeout, "launch timeout");
+TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing,
+                            "launch incompatible texturing");
+TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled,
+                            "peer access already enabled");
+TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled, "peer access not enabled");
+TRITON_CREATE_HIP_EXCEPTION(primary_context_active, "primary context active");
+TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed, "context is destroyed");
+TRITON_CREATE_HIP_EXCEPTION(assert_error, "assert");
+TRITON_CREATE_HIP_EXCEPTION(too_many_peers, "too many peers");
+TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered,
+                            "host memory already registered");
+TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered,
+                            "host memory not registered");
+TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error, "hardware stack error");
+TRITON_CREATE_HIP_EXCEPTION(illegal_instruction, "illegal instruction");
+TRITON_CREATE_HIP_EXCEPTION(misaligned_address, "misaligned address");
+TRITON_CREATE_HIP_EXCEPTION(invalid_address_space, "invalid address space");
+TRITON_CREATE_HIP_EXCEPTION(invalid_pc, "invalid pc");
+TRITON_CREATE_HIP_EXCEPTION(launch_failed, "launch failed");
+TRITON_CREATE_HIP_EXCEPTION(not_permitted, "not permitted");
+TRITON_CREATE_HIP_EXCEPTION(not_supported, "not supported");
+TRITON_CREATE_HIP_EXCEPTION(invalid_symbol, "invalid symbol");
+TRITON_CREATE_HIP_EXCEPTION(unknown, "unknown");
 
 #undef TRITON_CREATE_CUDA_EXCEPTION
-  }
+} // namespace hip
 
-  }
-  }
-}
+} // namespace exception
+} // namespace driver
+} // namespace triton
 
 #endif
diff --git a/include/triton/driver/llvm.h b/include/triton/driver/llvm.h
index c0c1c0f37..b3ce0d0cc 100644
--- a/include/triton/driver/llvm.h
+++ b/include/triton/driver/llvm.h
@@ -1,20 +1,21 @@
-#include <string>
 #include "triton/driver/dispatch.h"
+#include <string>
 
-namespace llvm{
+namespace llvm {
 class Module;
 }
 
-namespace triton{
-namespace driver{
+namespace triton {
+namespace driver {
 
 void init_llvm();
-std::string path_to_ptxas(int& version);
-std::string llir_to_ptx(llvm::Module* module, int cc, int version);
-std::string ptx_to_cubin(const std::string& ptx, const std::string& ptxas_path, int cc);
-CUmodule ptx_to_cumodule(const std::string& ptx, int cc);
-std::string llir_to_amdgpu(llvm::Module* module, const std::string& proc);
-hipModule_t amdgpu_to_hipmodule(const std::string& path);
+std::string path_to_ptxas(int &version);
+std::string llir_to_ptx(llvm::Module *module, int cc, int version);
+std::string ptx_to_cubin(const std::string &ptx, const std::string &ptxas_path,
+                         int cc);
+CUmodule ptx_to_cumodule(const std::string &ptx, int cc);
+std::string llir_to_amdgpu(llvm::Module *module, const std::string &proc);
+hipModule_t amdgpu_to_hipmodule(const std::string &path);
 
-}
-}
+} // namespace driver
+} // namespace triton
diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp
index c0dbd5061..258e06933 100644
--- a/include/triton/tools/bench.hpp
+++ b/include/triton/tools/bench.hpp
@@ -3,52 +3,55 @@
 #ifndef _TRITON_TOOLS_BENCH_H_
 #define _TRITON_TOOLS_BENCH_H_
 
-#include <chrono>
-#include <functional>
-#include <algorithm>
 #include "triton/driver/device.h"
 #include "triton/driver/stream.h"
+#include <algorithm>
+#include <chrono>
+#include <functional>
 
-namespace triton{
-namespace tools{
+namespace triton {
+namespace tools {
 
-class timer{
-    typedef std::chrono::high_resolution_clock high_resolution_clock;
-    typedef std::chrono::nanoseconds nanoseconds;
+class timer {
+  typedef std::chrono::high_resolution_clock high_resolution_clock;
+  typedef std::chrono::nanoseconds nanoseconds;
 
 public:
-    explicit timer(bool run = false)
-    { if (run) start(); }
+  explicit timer(bool run = false) {
+    if (run)
+      start();
+  }
 
-    void start()
-    { _start = high_resolution_clock::now(); }
+  void start() { _start = high_resolution_clock::now(); }
 
-    nanoseconds get() const
-    { return std::chrono::duration_cast<nanoseconds>(high_resolution_clock::now() - _start); }
+  nanoseconds get() const {
+    return std::chrono::duration_cast<nanoseconds>(
+        high_resolution_clock::now() - _start);
+  }
 
 private:
-    high_resolution_clock::time_point _start;
+  high_resolution_clock::time_point _start;
 };
 
-inline double bench(std::function<void()> const & op, driver::stream * stream, size_t warmup = 10, size_t repeat = 200)
-{
+inline double bench(std::function<void()> const &op, driver::stream *stream,
+                    size_t warmup = 10, size_t repeat = 200) {
   timer tmr;
   std::vector<size_t> times;
   double total_time = 0;
-  for(size_t i = 0; i < warmup; i++)
+  for (size_t i = 0; i < warmup; i++)
     op();
   stream->synchronize();
   tmr.start();
-  for(size_t i = 0; i < repeat; i++){
+  for (size_t i = 0; i < repeat; i++) {
     op();
   }
   stream->synchronize();
   return (float)tmr.get().count() / repeat;
 
-//  return *std::min_element(times.begin(), times.end());
+  //  return *std::min_element(times.begin(), times.end());
 }
 
-}
-}
+} // namespace tools
+} // namespace triton
 
 #endif
diff --git a/include/triton/tools/graph.h b/include/triton/tools/graph.h
index c2ba8d854..3725eb091 100644
--- a/include/triton/tools/graph.h
+++ b/include/triton/tools/graph.h
@@ -3,16 +3,15 @@
 #ifndef _TRITON_TOOLS_THREAD_GRAPH_H_
 #define _TRITON_TOOLS_THREAD_GRAPH_H_
 
+#include <iostream>
 #include <map>
 #include <set>
 #include <vector>
-#include <iostream>
 
 namespace triton {
-namespace tools{
+namespace tools {
 
-template<class node_t>
-class graph {
+template <class node_t> class graph {
   typedef std::map<node_t, std::set<node_t>> edges_t;
 
 public:
@@ -21,27 +20,27 @@ public:
 
 private:
   void connected_components_impl(node_t x, std::set<node_t> &nodes,
-                                 nmap_t* nmap, cmap_t* cmap, int id) const {
-    if(nmap)
+                                 nmap_t *nmap, cmap_t *cmap, int id) const {
+    if (nmap)
       (*nmap)[x] = id;
-    if(cmap)
+    if (cmap)
       (*cmap)[id].push_back(x);
-    if(nodes.find(x) != nodes.end()) {
+    if (nodes.find(x) != nodes.end()) {
       nodes.erase(x);
-      for(const node_t &y: edges_.at(x))
+      for (const node_t &y : edges_.at(x))
         connected_components_impl(y, nodes, nmap, cmap, id);
     }
   }
 
 public:
   void connected_components(cmap_t *cmap, nmap_t *nmap) const {
-    if(cmap)
+    if (cmap)
       cmap->clear();
-    if(nmap)
+    if (nmap)
       nmap->clear();
     std::set<node_t> nodes = nodes_;
     unsigned id = 0;
-    while(!nodes.empty()){
+    while (!nodes.empty()) {
       connected_components_impl(*nodes.begin(), nodes, nmap, cmap, id++);
     }
   }
@@ -63,7 +62,7 @@ private:
   edges_t edges_;
 };
 
-}
-}
+} // namespace tools
+} // namespace triton
 
 #endif
diff --git a/include/triton/tools/sha1.hpp b/include/triton/tools/sha1.hpp
index 630a3fd77..1e71034de 100644
--- a/include/triton/tools/sha1.hpp
+++ b/include/triton/tools/sha1.hpp
@@ -33,154 +33,140 @@
 #ifndef _TRITON_TOOLS_SHA1_HPP_
 #define _TRITON_TOOLS_SHA1_HPP_
 
-namespace sha1
+namespace sha1 {
+namespace // local
 {
-    namespace // local
-    {
-        // Rotate an integer value to left.
-        inline unsigned int rol(const unsigned int value,
-                const unsigned int steps)
-        {
-            return ((value << steps) | (value >> (32 - steps)));
-        }
+// Rotate an integer value to left.
+inline unsigned int rol(const unsigned int value, const unsigned int steps) {
+  return ((value << steps) | (value >> (32 - steps)));
+}
 
-        // Sets the first 16 integers in the buffert to zero.
-        // Used for clearing the W buffert.
-        inline void clearWBuffert(unsigned int* buffert)
-        {
-            for (int pos = 16; --pos >= 0;)
-            {
-                buffert[pos] = 0;
-            }
-        }
+// Sets the first 16 integers in the buffert to zero.
+// Used for clearing the W buffert.
+inline void clearWBuffert(unsigned int *buffert) {
+  for (int pos = 16; --pos >= 0;) {
+    buffert[pos] = 0;
+  }
+}
 
-        inline void innerHash(unsigned int* result, unsigned int* w)
-        {
-            unsigned int a = result[0];
-            unsigned int b = result[1];
-            unsigned int c = result[2];
-            unsigned int d = result[3];
-            unsigned int e = result[4];
+inline void innerHash(unsigned int *result, unsigned int *w) {
+  unsigned int a = result[0];
+  unsigned int b = result[1];
+  unsigned int c = result[2];
+  unsigned int d = result[3];
+  unsigned int e = result[4];
 
-            int round = 0;
+  int round = 0;
 
-            #define sha1macro(func,val) \
-      { \
-                const unsigned int t = rol(a, 5) + (func) + e + val + w[round]; \
-        e = d; \
-        d = c; \
-        c = rol(b, 30); \
-        b = a; \
-        a = t; \
-      }
+#define sha1macro(func, val)                                                   \
+  {                                                                            \
+    const unsigned int t = rol(a, 5) + (func) + e + val + w[round];            \
+    e = d;                                                                     \
+    d = c;                                                                     \
+    c = rol(b, 30);                                                            \
+    b = a;                                                                     \
+    a = t;                                                                     \
+  }
 
-            while (round < 16)
-            {
-                sha1macro((b & c) | (~b & d), 0x5a827999)
-                ++round;
-            }
-            while (round < 20)
-            {
-                w[round] = rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-                sha1macro((b & c) | (~b & d), 0x5a827999)
-                ++round;
-            }
-            while (round < 40)
-            {
-                w[round] = rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-                sha1macro(b ^ c ^ d, 0x6ed9eba1)
-                ++round;
-            }
-            while (round < 60)
-            {
-                w[round] = rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-                sha1macro((b & c) | (b & d) | (c & d), 0x8f1bbcdc)
-                ++round;
-            }
-            while (round < 80)
-            {
-                w[round] = rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-                sha1macro(b ^ c ^ d, 0xca62c1d6)
-                ++round;
-            }
+  while (round < 16) {
+    sha1macro((b & c) | (~b & d), 0x5a827999)++ round;
+  }
+  while (round < 20) {
+    w[round] =
+        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
+    sha1macro((b & c) | (~b & d), 0x5a827999)++ round;
+  }
+  while (round < 40) {
+    w[round] =
+        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
+    sha1macro(b ^ c ^ d, 0x6ed9eba1)++ round;
+  }
+  while (round < 60) {
+    w[round] =
+        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
+    sha1macro((b & c) | (b & d) | (c & d), 0x8f1bbcdc)++ round;
+  }
+  while (round < 80) {
+    w[round] =
+        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
+    sha1macro(b ^ c ^ d, 0xca62c1d6)++ round;
+  }
 
-            #undef sha1macro
+#undef sha1macro
 
-            result[0] += a;
-            result[1] += b;
-            result[2] += c;
-            result[3] += d;
-            result[4] += e;
-        }
-    } // namespace
+  result[0] += a;
+  result[1] += b;
+  result[2] += c;
+  result[3] += d;
+  result[4] += e;
+}
+} // namespace
 
-    inline void calc(const void* src, const int bytelength, unsigned char* hash)
-    {
-        // Init the result array.
-        unsigned int result[5] = { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 };
+inline void calc(const void *src, const int bytelength, unsigned char *hash) {
+  // Init the result array.
+  unsigned int result[5] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476,
+                            0xc3d2e1f0};
 
-        // Cast the void src pointer to be the byte array we can work with.
-        const unsigned char* sarray = (const unsigned char*) src;
+  // Cast the void src pointer to be the byte array we can work with.
+  const unsigned char *sarray = (const unsigned char *)src;
 
-        // The reusable round buffer
-        unsigned int w[80];
+  // The reusable round buffer
+  unsigned int w[80];
 
-        // Loop through all complete 64byte blocks.
-        const int endOfFullBlocks = bytelength - 64;
-        int endCurrentBlock;
-        int currentBlock = 0;
+  // Loop through all complete 64byte blocks.
+  const int endOfFullBlocks = bytelength - 64;
+  int endCurrentBlock;
+  int currentBlock = 0;
 
-        while (currentBlock <= endOfFullBlocks)
-        {
-            endCurrentBlock = currentBlock + 64;
+  while (currentBlock <= endOfFullBlocks) {
+    endCurrentBlock = currentBlock + 64;
 
-            // Init the round buffer with the 64 byte block data.
-            for (int roundPos = 0; currentBlock < endCurrentBlock; currentBlock += 4)
-            {
-                // This line will swap endian on big endian and keep endian on little endian.
-                w[roundPos++] = (unsigned int) sarray[currentBlock + 3]
-                        | (((unsigned int) sarray[currentBlock + 2]) << 8)
-                        | (((unsigned int) sarray[currentBlock + 1]) << 16)
-                        | (((unsigned int) sarray[currentBlock]) << 24);
-            }
-            innerHash(result, w);
-        }
-
-        // Handle the last and not full 64 byte block if existing.
-        endCurrentBlock = bytelength - currentBlock;
-        clearWBuffert(w);
-        int lastBlockBytes = 0;
-        for (;lastBlockBytes < endCurrentBlock; ++lastBlockBytes)
-        {
-            w[lastBlockBytes >> 2] |= (unsigned int) sarray[lastBlockBytes + currentBlock] << ((3 - (lastBlockBytes & 3)) << 3);
-        }
-        w[lastBlockBytes >> 2] |= 0x80 << ((3 - (lastBlockBytes & 3)) << 3);
-        if (endCurrentBlock >= 56)
-        {
-            innerHash(result, w);
-            clearWBuffert(w);
-        }
-        w[15] = bytelength << 3;
-        innerHash(result, w);
-
-        // Store hash in result pointer, and make sure we get in in the correct order on both endian models.
-        for (int hashByte = 20; --hashByte >= 0;)
-        {
-            hash[hashByte] = (result[hashByte >> 2] >> (((3 - hashByte) & 0x3) << 3)) & 0xff;
-        }
+    // Init the round buffer with the 64 byte block data.
+    for (int roundPos = 0; currentBlock < endCurrentBlock; currentBlock += 4) {
+      // Assemble each word most-significant byte first, so the result is the
+      // same on little-endian and big-endian hosts.
+      w[roundPos++] = (unsigned int)sarray[currentBlock + 3] |
+                      (((unsigned int)sarray[currentBlock + 2]) << 8) |
+                      (((unsigned int)sarray[currentBlock + 1]) << 16) |
+                      (((unsigned int)sarray[currentBlock]) << 24);
     }
+    innerHash(result, w);
+  }
 
-    inline void toHexString(const unsigned char* hash, char* hexstring)
-    {
-        const char hexDigits[] = { "0123456789abcdef" };
+  // Handle the last and not full 64 byte block if existing.
+  endCurrentBlock = bytelength - currentBlock;
+  clearWBuffert(w);
+  int lastBlockBytes = 0;
+  for (; lastBlockBytes < endCurrentBlock; ++lastBlockBytes) {
+    w[lastBlockBytes >> 2] |=
+        (unsigned int)sarray[lastBlockBytes + currentBlock]
+        << ((3 - (lastBlockBytes & 3)) << 3);
+  }
+  w[lastBlockBytes >> 2] |= 0x80 << ((3 - (lastBlockBytes & 3)) << 3);
+  if (endCurrentBlock >= 56) {
+    innerHash(result, w);
+    clearWBuffert(w);
+  }
+  w[15] = bytelength << 3;
+  innerHash(result, w);
 
-        for (int hashByte = 20; --hashByte >= 0;)
-        {
-            hexstring[hashByte << 1] = hexDigits[(hash[hashByte] >> 4) & 0xf];
-            hexstring[(hashByte << 1) + 1] = hexDigits[hash[hashByte] & 0xf];
-        }
-        hexstring[40] = 0;
-    }
+  // Store hash in result pointer, and make sure we get it in the correct order
+  // on both endian models.
+  for (int hashByte = 20; --hashByte >= 0;) {
+    hash[hashByte] =
+        (result[hashByte >> 2] >> (((3 - hashByte) & 0x3) << 3)) & 0xff;
+  }
+}
+
+inline void toHexString(const unsigned char *hash, char *hexstring) {
+  const char hexDigits[] = {"0123456789abcdef"};
+
+  for (int hashByte = 20; --hashByte >= 0;) {
+    hexstring[hashByte << 1] = hexDigits[(hash[hashByte] >> 4) & 0xf];
+    hexstring[(hashByte << 1) + 1] = hexDigits[hash[hashByte] & 0xf];
+  }
+  hexstring[40] = 0;
+}
 } // namespace sha1
 
 #endif
diff --git a/include/triton/tools/sys/exec.hpp b/include/triton/tools/sys/exec.hpp
index 5b664553e..e96a04314 100644
--- a/include/triton/tools/sys/exec.hpp
+++ b/include/triton/tools/sys/exec.hpp
@@ -7,11 +7,8 @@
 #include <stdexcept>
 #include <string>
 
-namespace triton
-{
-namespace tools
-{
-
+namespace triton {
+namespace tools {
 
 #ifdef _WIN32
 #define popen _popen
@@ -19,12 +16,12 @@ namespace tools
 #endif
 
 #ifndef WEXITSTATUS
-#define WEXITSTATUS(stat_val) ((unsigned)(stat_val) & 255)
+#define WEXITSTATUS(stat_val) ((unsigned)(stat_val)&255)
 #endif
 
-int exec(const std::string& cmd, std::string& result) {
+int exec(const std::string &cmd, std::string &result) {
   char buffer[128];
-  FILE* pipe = popen(cmd.c_str(), "r");
+  FILE *pipe = popen(cmd.c_str(), "r");
   if (!pipe)
     return 0;
   result.clear();
@@ -37,10 +34,9 @@ int exec(const std::string& cmd, std::string& result) {
   }
   int status = pclose(pipe);
   return WEXITSTATUS(status);
-
 }
 
-}
-}
+} // namespace tools
+} // namespace triton
 
 #endif
diff --git a/include/triton/tools/sys/getenv.hpp b/include/triton/tools/sys/getenv.hpp
old mode 100755
new mode 100644
index 755a84a66..1f1c57521
--- a/include/triton/tools/sys/getenv.hpp
+++ b/include/triton/tools/sys/getenv.hpp
@@ -22,26 +22,23 @@
 #ifndef TDL_TOOLS_SYS_GETENV_HPP
 #define TDL_TOOLS_SYS_GETENV_HPP
 
-#include <string>
 #include <cstdlib>
+#include <string>
 
-namespace triton
-{
+namespace triton {
 
-namespace tools
-{
-
-    inline std::string getenv(const char * name)
-    {
-        const char * cstr = std::getenv(name);
-        if(!cstr)
-            return "";
-        std::string result(cstr);
-        return result;
-    }
+namespace tools {
 
+inline std::string getenv(const char *name) {
+  const char *cstr = std::getenv(name);
+  if (!cstr)
+    return "";
+  std::string result(cstr);
+  return result;
 }
 
-}
+} // namespace tools
+
+} // namespace triton
 
 #endif
diff --git a/include/triton/tools/sys/mkdir.hpp b/include/triton/tools/sys/mkdir.hpp
old mode 100755
new mode 100644
index 5198a0098..10cb0da6a
--- a/include/triton/tools/sys/mkdir.hpp
+++ b/include/triton/tools/sys/mkdir.hpp
@@ -22,55 +22,49 @@
 #ifndef TDL_TOOLS_SYS_MKDIR_HPP
 #define TDL_TOOLS_SYS_MKDIR_HPP
 
-#include <cstring>
-#include <string>
 #include <cstdlib>
-#include <sys/stat.h>
+#include <cstring>
 #include <errno.h>
+#include <string>
+#include <sys/stat.h>
 #if defined(_WIN32)
-  #include <direct.h>
+#include <direct.h>
 #endif
 
-namespace triton
-{
+namespace triton {
 
-namespace tools
-{
-
-    inline int mkdir(std::string const & path)
-    {
-        #if defined(_WIN32)
-            return _mkdir(path.c_str());
-        #else
-            return ::mkdir(path.c_str(), 0777);
-        #endif
-    }
-
-    inline int mkpath(std::string const & path)
-    {
-        int status = 0;
-        size_t pp = 0;
-        size_t sp;
-        while ((sp = path.find('/', pp)) != std::string::npos)
-        {
-            if (sp != pp){
-                status = mkdir(path.substr(0, sp));
-            }
-            pp = sp + 1;
-        }
-        return (status==0 || errno==EEXIST)?0:-1;
-    }
-
-    inline int mtime(std::string const & path)
-    {
-      struct stat st;
-      if(stat(path.c_str(), &st) != 0)
-        return 0;
-      return st.st_mtime;
-    }
+namespace tools {
 
+inline int mkdir(std::string const &path) {
+#if defined(_WIN32)
+  return _mkdir(path.c_str());
+#else
+  return ::mkdir(path.c_str(), 0777);
+#endif
 }
 
+inline int mkpath(std::string const &path) {
+  int status = 0;
+  size_t pp = 0;
+  size_t sp;
+  while ((sp = path.find('/', pp)) != std::string::npos) {
+    if (sp != pp) {
+      status = mkdir(path.substr(0, sp));
+    }
+    pp = sp + 1;
+  }
+  return (status == 0 || errno == EEXIST) ? 0 : -1;
 }
 
+inline int mtime(std::string const &path) {
+  struct stat st;
+  if (stat(path.c_str(), &st) != 0)
+    return 0;
+  return st.st_mtime;
+}
+
+} // namespace tools
+
+} // namespace triton
+
 #endif
diff --git a/include/triton/tools/thread_pool.h b/include/triton/tools/thread_pool.h
index fbcf2b684..e8a6ca6ca 100644
--- a/include/triton/tools/thread_pool.h
+++ b/include/triton/tools/thread_pool.h
@@ -3,88 +3,79 @@
 #ifndef _TRITON_TOOLS_THREAD_POOL_H_
 #define _TRITON_TOOLS_THREAD_POOL_H_
 
-#include <vector>
-#include <queue>
-#include <memory>
-#include <thread>
-#include <mutex>
 #include <condition_variable>
-#include <future>
 #include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <queue>
 #include <stdexcept>
+#include <thread>
+#include <vector>
 
 class ThreadPool {
 public:
-    ThreadPool(size_t threads)
-        :   stop(false) {
-      for(size_t i = 0;i < threads;++i)
-          workers.emplace_back(
-              [this] {
-                for(;;){
-                  std::function<void()> task;
-                  {
-                    std::unique_lock<std::mutex> lock(this->queue_mutex);
-                    this->condition.wait(lock,
-                      [this]{ return this->stop || !this->tasks.empty(); });
-                    if(this->stop && this->tasks.empty())
-                      return;
-                    task = std::move(this->tasks.front());
-                    this->tasks.pop();
-                  }
-                  task();
-                }
-              }
-          );
-    }
+  ThreadPool(size_t threads) : stop(false) {
+    for (size_t i = 0; i < threads; ++i)
+      workers.emplace_back([this] {
+        for (;;) {
+          std::function<void()> task;
+          {
+            std::unique_lock<std::mutex> lock(this->queue_mutex);
+            this->condition.wait(
+                lock, [this] { return this->stop || !this->tasks.empty(); });
+            if (this->stop && this->tasks.empty())
+              return;
+            task = std::move(this->tasks.front());
+            this->tasks.pop();
+          }
+          task();
+        }
+      });
+  }
 
+  template <class F, class... Args>
+  auto enqueue(F &&f, Args &&... args)
+      -> std::future<typename std::result_of<F(Args...)>::type> {
+    using return_type = typename std::result_of<F(Args...)>::type;
 
-    template<class F, class... Args>
-    auto enqueue(F&& f, Args&&... args)
-        -> std::future<typename std::result_of<F(Args...)>::type>
+    auto task = std::make_shared<std::packaged_task<return_type()>>(
+        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+
+    std::future<return_type> res = task->get_future();
     {
-        using return_type = typename std::result_of<F(Args...)>::type;
+      std::unique_lock<std::mutex> lock(queue_mutex);
 
-        auto task = std::make_shared< std::packaged_task<return_type()> >(
-                std::bind(std::forward<F>(f), std::forward<Args>(args)...)
-            );
+      // don't allow enqueueing after stopping the pool
+      if (stop)
+        throw std::runtime_error("enqueue on stopped ThreadPool");
 
-        std::future<return_type> res = task->get_future();
-        {
-            std::unique_lock<std::mutex> lock(queue_mutex);
-
-            // don't allow enqueueing after stopping the pool
-            if(stop)
-                throw std::runtime_error("enqueue on stopped ThreadPool");
-
-            tasks.emplace([task](){ (*task)(); });
-        }
-        condition.notify_one();
-        return res;
+      tasks.emplace([task]() { (*task)(); });
     }
+    condition.notify_one();
+    return res;
+  }
 
-
-    ~ThreadPool() {
-        {
-          std::unique_lock<std::mutex> lock(queue_mutex);
-          stop = true;
-        }
-        condition.notify_all();
-        for(std::thread &worker: workers)
-          worker.join();
+  ~ThreadPool() {
+    {
+      std::unique_lock<std::mutex> lock(queue_mutex);
+      stop = true;
     }
-
+    condition.notify_all();
+    for (std::thread &worker : workers)
+      worker.join();
+  }
 
 private:
-    // need to keep track of threads so we can join them
-    std::vector< std::thread > workers;
-    // the task queue
-    std::queue< std::function<void()> > tasks;
+  // need to keep track of threads so we can join them
+  std::vector<std::thread> workers;
+  // the task queue
+  std::queue<std::function<void()>> tasks;
 
-    // synchronization
-    std::mutex queue_mutex;
-    std::condition_variable condition;
-    bool stop;
+  // synchronization
+  std::mutex queue_mutex;
+  std::condition_variable condition;
+  bool stop;
 };
 
-
 #endif
diff --git a/lib/Analysis/AxisInfo.cpp b/lib/Analysis/AxisInfo.cpp
index 6222e5261..ef926c190 100644
--- a/lib/Analysis/AxisInfo.cpp
+++ b/lib/Analysis/AxisInfo.cpp
@@ -8,24 +8,23 @@
 
 namespace mlir {
 
-
 //===----------------------------------------------------------------------===//
 // AxisInfo
 //===----------------------------------------------------------------------===//
 
- // Function for extended Euclidean Algorithm
-static int gcd_impl(int a, int b, int *x, int *y){
+// Function for extended Euclidean Algorithm
+static int gcd_impl(int a, int b, int *x, int *y) {
   // Base Case
   if (a == 0) {
-      *x = 0;
-      *y = 1;
-      return b;
+    *x = 0;
+    *y = 1;
+    return b;
   }
   int x1, y1; // To store results of recursive call
-  int gcd = gcd_impl(b%a, a, &x1, &y1);
+  int gcd = gcd_impl(b % a, a, &x1, &y1);
   // Update x and y using results of
   // recursive call
-  *x = y1 - (b/a) * x1;
+  *x = y1 - (b / a) * x1;
   *y = x1;
   return gcd;
 }
@@ -35,17 +34,17 @@ static int gcd(int a, int b) {
   return gcd_impl(a, b, &x, &y);
 }
 
-
 AxisInfo AxisInfo::getPessimisticValueState(Value value) {
   size_t rank = 1;
-  if(TensorType ty = value.getType().dyn_cast<TensorType>())
+  if (TensorType ty = value.getType().dyn_cast<TensorType>())
     rank = ty.getRank();
   int divHint = 1;
-  if(BlockArgument blockArg = value.dyn_cast<BlockArgument>()){
-    Operation* op = blockArg.getOwner()->getParentOp();
-    if(FuncOp fun = dyn_cast<FuncOp>(op)){
-      Attribute attr = fun.getArgAttr(blockArg.getArgNumber(), "tt.divisibility");
-      if(attr)
+  if (BlockArgument blockArg = value.dyn_cast<BlockArgument>()) {
+    Operation *op = blockArg.getOwner()->getParentOp();
+    if (FuncOp fun = dyn_cast<FuncOp>(op)) {
+      Attribute attr =
+          fun.getArgAttr(blockArg.getArgNumber(), "tt.divisibility");
+      if (attr)
         divHint = attr.cast<IntegerAttr>().getValue().getZExtValue();
     }
   }
@@ -55,51 +54,51 @@ AxisInfo AxisInfo::getPessimisticValueState(Value value) {
   return AxisInfo(contiguity, divisibility, constancy);
 }
 
-
 // The gcd of both arguments for each dimension
-AxisInfo AxisInfo::join(const AxisInfo &lhs,
-                        const AxisInfo &rhs) {
+AxisInfo AxisInfo::join(const AxisInfo &lhs, const AxisInfo &rhs) {
   ContiguityT retContiguity;
   DivisibilityT retDivisibility;
   ConstancyT retConstancy;
-  for(size_t d = 0; d < lhs.getRank(); d++){
+  for (size_t d = 0; d < lhs.getRank(); d++) {
     retContiguity.push_back(gcd(lhs.getContiguity(d), rhs.getContiguity(d)));
-    retDivisibility.push_back(gcd(lhs.getDivisibility(d), rhs.getDivisibility(d)));
+    retDivisibility.push_back(
+        gcd(lhs.getDivisibility(d), rhs.getDivisibility(d)));
     retConstancy.push_back(gcd(lhs.getConstancy(d), rhs.getConstancy(d)));
   }
   return AxisInfo(retContiguity, retDivisibility, retConstancy);
 }
 
-
 //===----------------------------------------------------------------------===//
 // AxisInfoAnalysis
 //===----------------------------------------------------------------------===//
 
-AxisInfo AxisInfoAnalysis::visitBinaryOp(Operation* op, AxisInfo lhsInfo, AxisInfo rhsInfo,
-                        const std::function<int(AxisInfo,AxisInfo,int)>& getContiguity,
-                        const std::function<int(AxisInfo,AxisInfo,int)>& getDivisibility,
-                        const std::function<int(AxisInfo,AxisInfo,int)>& getConstancy) {
-    int rank = lhsInfo.getRank();
-    AxisInfo::ContiguityT newContiguity;
-    AxisInfo::DivisibilityT newDivisibility;
-    AxisInfo::ConstancyT newConstancy;
-    for(size_t d = 0; d < rank; d++){
-      newContiguity.push_back(getContiguity(lhsInfo, rhsInfo, d));
-      newDivisibility.push_back(getDivisibility(lhsInfo, rhsInfo, d));
-      newConstancy.push_back(getConstancy(lhsInfo, rhsInfo, d));
-    }
-    return AxisInfo(newContiguity, newDivisibility, newConstancy);
+AxisInfo AxisInfoAnalysis::visitBinaryOp(
+    Operation *op, AxisInfo lhsInfo, AxisInfo rhsInfo,
+    const std::function<int(AxisInfo, AxisInfo, int)> &getContiguity,
+    const std::function<int(AxisInfo, AxisInfo, int)> &getDivisibility,
+    const std::function<int(AxisInfo, AxisInfo, int)> &getConstancy) {
+  int rank = lhsInfo.getRank();
+  AxisInfo::ContiguityT newContiguity;
+  AxisInfo::DivisibilityT newDivisibility;
+  AxisInfo::ConstancyT newConstancy;
+  for (size_t d = 0; d < rank; d++) {
+    newContiguity.push_back(getContiguity(lhsInfo, rhsInfo, d));
+    newDivisibility.push_back(getDivisibility(lhsInfo, rhsInfo, d));
+    newConstancy.push_back(getConstancy(lhsInfo, rhsInfo, d));
+  }
+  return AxisInfo(newContiguity, newDivisibility, newConstancy);
 }
 
-ChangeResult AxisInfoAnalysis::visitOperation(Operation *op,
-                    ArrayRef<LatticeElement<AxisInfo> *> operands) {
+ChangeResult AxisInfoAnalysis::visitOperation(
+    Operation *op, ArrayRef<LatticeElement<AxisInfo> *> operands) {
   AxisInfo curr;
   // This preserves the input axes (e.g., cast):
   if (llvm::isa<arith::ExtSIOp, arith::ExtUIOp, arith::TruncIOp,
                 triton::PtrToIntOp, triton::IntToPtrOp>(op))
     curr = operands[0]->getValue();
   // Constant ranges
-  if (triton::MakeRangeOp make_range = llvm::dyn_cast<triton::MakeRangeOp>(op)){
+  if (triton::MakeRangeOp make_range =
+          llvm::dyn_cast<triton::MakeRangeOp>(op)) {
     int start = make_range.start();
     int end = make_range.end();
     AxisInfo::ContiguityT contiguity = {end - start};
@@ -108,61 +107,59 @@ ChangeResult AxisInfoAnalysis::visitOperation(Operation *op,
     curr = AxisInfo(contiguity, divisibility, constancy);
   }
   // Constant
-  if (arith::ConstantOp constant = llvm::dyn_cast<arith::ConstantOp>(op)){
+  if (arith::ConstantOp constant = llvm::dyn_cast<arith::ConstantOp>(op)) {
     auto intAttr = constant.getValue().dyn_cast<IntegerAttr>();
-    if(intAttr){
+    if (intAttr) {
       size_t val = intAttr.getValue().getZExtValue();
       curr = AxisInfo({1}, {highestPowOf2Divisor(val)}, {1});
     }
     // TODO: generalize to dense attr
     auto splatAttr = constant.getValue().dyn_cast<SplatElementsAttr>();
-    if(splatAttr && splatAttr.getElementType().isInteger(32)){
+    if (splatAttr && splatAttr.getElementType().isInteger(32)) {
       auto value = splatAttr.getSplatValue<int>();
       TensorType ty = splatAttr.getType().cast<TensorType>();
-      curr = AxisInfo(AxisInfo::ContiguityT(ty.getRank(), 1),
-                      AxisInfo::DivisibilityT(ty.getRank(), highestPowOf2Divisor(value)),
-                      AxisInfo::ConstancyT(ty.getShape().begin(), ty.getShape().end()));
-
+      curr = AxisInfo(
+          AxisInfo::ContiguityT(ty.getRank(), 1),
+          AxisInfo::DivisibilityT(ty.getRank(), highestPowOf2Divisor(value)),
+          AxisInfo::ConstancyT(ty.getShape().begin(), ty.getShape().end()));
     }
   }
   // Addition
-  if (llvm::isa<arith::AddIOp, triton::GEPOp>(op)){
-    auto newContiguity = [&](AxisInfo lhs, AxisInfo rhs, int d){
+  if (llvm::isa<arith::AddIOp, triton::GEPOp>(op)) {
+    auto newContiguity = [&](AxisInfo lhs, AxisInfo rhs, int d) {
       return std::max(gcd(lhs.getContiguity(d), rhs.getConstancy(d)),
                       gcd(lhs.getConstancy(d), rhs.getContiguity(d)));
     };
-    auto newConstancy = [&](AxisInfo lhs, AxisInfo rhs, int d){
+    auto newConstancy = [&](AxisInfo lhs, AxisInfo rhs, int d) {
       return gcd(lhs.getConstancy(d), rhs.getConstancy(d));
     };
-    auto newDivisibility = [&](AxisInfo lhs, AxisInfo rhs, int d){
+    auto newDivisibility = [&](AxisInfo lhs, AxisInfo rhs, int d) {
       return gcd(lhs.getDivisibility(d), rhs.getDivisibility(d));
     };
     curr = visitBinaryOp(op, operands[0]->getValue(), operands[1]->getValue(),
-                          newContiguity, newDivisibility, newConstancy);
+                         newContiguity, newDivisibility, newConstancy);
   }
   // Multiplication
-  if (llvm::isa<arith::MulIOp>(op)){
-    auto newContiguity = [](AxisInfo lhs, AxisInfo rhs, int d){ 
-      return 1; 
-    };
-    auto newConstancy = [](AxisInfo lhs, AxisInfo rhs, int d){
+  if (llvm::isa<arith::MulIOp>(op)) {
+    auto newContiguity = [](AxisInfo lhs, AxisInfo rhs, int d) { return 1; };
+    auto newConstancy = [](AxisInfo lhs, AxisInfo rhs, int d) {
       return gcd(lhs.getConstancy(d), rhs.getConstancy(d));
     };
-    auto newDivisibility = [](AxisInfo lhs, AxisInfo rhs, int d){
-      return lhs.getDivisibility(d)*rhs.getDivisibility(d);
+    auto newDivisibility = [](AxisInfo lhs, AxisInfo rhs, int d) {
+      return lhs.getDivisibility(d) * rhs.getDivisibility(d);
     };
     curr = visitBinaryOp(op, operands[0]->getValue(), operands[1]->getValue(),
-                          newContiguity, newDivisibility, newConstancy);
+                         newContiguity, newDivisibility, newConstancy);
   }
   // Splat
-  if (llvm::isa<triton::SplatOp>(op)){
+  if (llvm::isa<triton::SplatOp>(op)) {
     Type _retTy = *op->result_type_begin();
     TensorType retTy = _retTy.cast<TensorType>();
     AxisInfo opInfo = operands[0]->getValue();
     AxisInfo::ContiguityT contiguity;
     AxisInfo::DivisibilityT divisibility;
     AxisInfo::ConstancyT constancy;
-    for(size_t d = 0; d < retTy.getRank(); d++){
+    for (size_t d = 0; d < retTy.getRank(); d++) {
       contiguity.push_back(1);
       divisibility.push_back(opInfo.getDivisibility(0));
       constancy.push_back(retTy.getShape()[d]);
@@ -171,7 +168,7 @@ ChangeResult AxisInfoAnalysis::visitOperation(Operation *op,
   }
   // Reshape
   // TODO: Replace by `unsqueeze`
-  if (llvm::isa<triton::ReshapeOp>(op)){
+  if (llvm::isa<triton::ReshapeOp>(op)) {
     Type _retTy = *op->result_type_begin();
     Type _opTy = *op->operand_type_begin();
     TensorType retTy = _retTy.cast<TensorType>();
@@ -184,20 +181,17 @@ ChangeResult AxisInfoAnalysis::visitOperation(Operation *op,
     AxisInfo::ConstancyT constancy;
     bool is_skewed = false;
     size_t current = 0;
-    for(size_t d = 0; d < retTy.getRank(); d++){
-      if(retShape[d] == 1){
+    for (size_t d = 0; d < retTy.getRank(); d++) {
+      if (retShape[d] == 1) {
         contiguity.push_back(1);
         divisibility.push_back(1);
         constancy.push_back(1);
-      }
-      else if(!is_skewed
-              && retShape[d] == opShape[current]){
+      } else if (!is_skewed && retShape[d] == opShape[current]) {
         contiguity.push_back(opInfo.getContiguity()[current]);
         divisibility.push_back(opInfo.getDivisibility()[current]);
         constancy.push_back(opInfo.getConstancy()[current]);
         current++;
-      }
-      else {
+      } else {
         is_skewed = true;
         contiguity.push_back(1);
         divisibility.push_back(1);
@@ -207,7 +201,7 @@ ChangeResult AxisInfoAnalysis::visitOperation(Operation *op,
     curr = AxisInfo(contiguity, divisibility, constancy);
   }
   // Broadcast
-  if (llvm::isa<triton::BroadcastOp>(op)){
+  if (llvm::isa<triton::BroadcastOp>(op)) {
     Type _retTy = *op->result_type_begin();
     Type _opTy = *op->operand_type_begin();
     TensorType retTy = _retTy.cast<TensorType>();
@@ -218,14 +212,14 @@ ChangeResult AxisInfoAnalysis::visitOperation(Operation *op,
     AxisInfo::ContiguityT contiguity;
     AxisInfo::DivisibilityT divisibility;
     AxisInfo::ConstancyT constancy;
-    for(size_t d = 0; d < retTy.getRank(); d++){
+    for (size_t d = 0; d < retTy.getRank(); d++) {
       contiguity.push_back(opShape[d] == 1 ? 1 : opInfo.getContiguity(d));
       divisibility.push_back(opInfo.getDivisibility(d));
       constancy.push_back(opShape[d] == 1 ? retShape[d] : 1);
     }
     curr = AxisInfo(contiguity, divisibility, constancy);
   }
-  if(curr.getRank() == 0){
+  if (curr.getRank() == 0) {
     return markAllPessimisticFixpoint(op->getResults());
   }
   // join all latice elements
@@ -236,4 +230,4 @@ ChangeResult AxisInfoAnalysis::visitOperation(Operation *op,
   return result;
 }
 
-}
\ No newline at end of file
+} // namespace mlir
\ No newline at end of file
diff --git a/lib/Conversion/PassDetail.h b/lib/Conversion/PassDetail.h
index e772f41b6..e60f15a84 100644
--- a/lib/Conversion/PassDetail.h
+++ b/lib/Conversion/PassDetail.h
@@ -2,14 +2,16 @@
 #define TRITON_CONVERSION_PASSDETAIL_H
 
 #include "mlir/Pass/Pass.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
 
-namespace mlir{
-namespace triton{
+namespace mlir {
+namespace triton {
 
 #define GEN_PASS_CLASSES
 #include "triton/Conversion/Passes.h.inc"
 
-}
-}
+} // namespace triton
+} // namespace mlir
 
 #endif
diff --git a/lib/Conversion/TritonToTritonGPU/TritonToTritonGPU.cpp b/lib/Conversion/TritonToTritonGPU/TritonToTritonGPU.cpp
index 22663b504..c927e766d 100644
--- a/lib/Conversion/TritonToTritonGPU/TritonToTritonGPU.cpp
+++ b/lib/Conversion/TritonToTritonGPU/TritonToTritonGPU.cpp
@@ -1,42 +1,42 @@
+#include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h"
+#include "../PassDetail.h"
+#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
-#include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h"
 #include "triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h"
-#include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
-#include "../PassDetail.h"
 
 using namespace mlir;
 using namespace mlir::triton;
 
 namespace {
 
-template<class Op>
-class ArithGenericPattern : public OpConversionPattern<Op> {
+template <class Op> class ArithGenericPattern : public OpConversionPattern<Op> {
 public:
   using OpConversionPattern<Op>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(Op op, typename Op::Adaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(Op op, typename Op::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     Type retType = this->getTypeConverter()->convertType(op.getType());
-    Op res = rewriter.replaceOpWithNewOp<Op>(
-      op, retType, adaptor.getOperands()
-    );
+    Op res =
+        rewriter.replaceOpWithNewOp<Op>(op, retType, adaptor.getOperands());
     return success();
   }
 };
 
-template<class SrcOp, class DstOp>
+template <class SrcOp, class DstOp>
 class ArithCmpPattern : public OpConversionPattern<SrcOp> {
 public:
   using OpConversionPattern<SrcOp>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(SrcOp op, typename SrcOp::Adaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(SrcOp op, typename SrcOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     Type retType = this->getTypeConverter()->convertType(op.getType());
-    DstOp res = rewriter.replaceOpWithNewOp<DstOp>(
-      op, retType, adaptor.getPredicate(), adaptor.getLhs(), adaptor.getRhs()
-    );
+    DstOp res =
+        rewriter.replaceOpWithNewOp<DstOp>(op, retType, adaptor.getPredicate(),
+                                           adaptor.getLhs(), adaptor.getRhs());
     return success();
   }
 };
@@ -45,36 +45,40 @@ class ArithConstantPattern : public OpConversionPattern<arith::ConstantOp> {
 public:
   using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     Type retType = getTypeConverter()->convertType(op.getType());
     auto value = adaptor.getValue().dyn_cast<DenseElementsAttr>();
     assert(value);
     rewriter.replaceOpWithNewOp<arith::ConstantOp>(
-      op, retType, value.reshape(retType) // This is a hack. We just want to add encoding
+        op, retType,
+        value.reshape(retType) // This is a hack. We just want to add encoding
     );
     return success();
   }
 };
 
-class ConvertArithmeticOp: public ConversionPattern {
+class ConvertArithmeticOp : public ConversionPattern {
 public:
-    ConvertArithmeticOp(TritonGPUTypeConverter &typeConverter, MLIRContext *context)
-        : ConversionPattern(typeConverter, MatchAnyOpTypeTag(), /*benefit=*/1,
-                            context) {}
+  ConvertArithmeticOp(TritonGPUTypeConverter &typeConverter,
+                      MLIRContext *context)
+      : ConversionPattern(typeConverter, MatchAnyOpTypeTag(), /*benefit=*/1,
+                          context) {}
 
-    LogicalResult matchAndRewrite(Operation* op, ArrayRef<Value> operands,
-                                  ConversionPatternRewriter& rewriter) const override {
-        Dialect* dialect = op->getDialect();
-        if(dialect->getTypeID() != mlir::TypeID::get<arith::ArithmeticDialect>()) 
-            return failure();
-        return success();
-    }
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    Dialect *dialect = op->getDialect();
+    if (dialect->getTypeID() != mlir::TypeID::get<arith::ArithmeticDialect>())
+      return failure();
+    return success();
+  }
 };
 
 void populateArithmeticPatternsAndLegality(
-    TritonGPUTypeConverter& typeConverter, RewritePatternSet &patterns,
-    TritonGPUConversionTarget &target){
+    TritonGPUTypeConverter &typeConverter, RewritePatternSet &patterns,
+    TritonGPUConversionTarget &target) {
   // --------------
   // Add legality and rewrite pattern rules for operations
   // from the Arithmetic dialect. The basic premise is that
@@ -91,59 +95,49 @@ void populateArithmeticPatternsAndLegality(
   // );
   // Rewrite rule
   // patterns.add<ConvertArithmeticOp>(typeConverter, context);
-  patterns.add<ArithConstantPattern,
-               ArithGenericPattern<arith::AddIOp>,
-               ArithGenericPattern<arith::SubIOp>,
-               ArithGenericPattern<arith::MulIOp>,
-               ArithGenericPattern<arith::DivUIOp>,
-               ArithGenericPattern<arith::DivSIOp>,
-               ArithGenericPattern<arith::CeilDivUIOp>,
-               ArithGenericPattern<arith::CeilDivSIOp>,
-               ArithGenericPattern<arith::FloorDivSIOp>,
-               ArithGenericPattern<arith::RemUIOp>,
-               ArithGenericPattern<arith::RemSIOp>,
-               ArithGenericPattern<arith::AndIOp>,
-               ArithGenericPattern<arith::OrIOp>,
-               ArithGenericPattern<arith::XOrIOp>,
-               ArithGenericPattern<arith::ShLIOp>,
-               ArithGenericPattern<arith::ShRUIOp>,
-               ArithGenericPattern<arith::ShRSIOp>, // NegFOp
-               // Floating point
-               ArithGenericPattern<arith::AddFOp>,
-               ArithGenericPattern<arith::SubFOp>,
-               // MaxMin
-               ArithGenericPattern<arith::MaxFOp>,
-               ArithGenericPattern<arith::MaxSIOp>,
-               ArithGenericPattern<arith::MaxUIOp>,
-               ArithGenericPattern<arith::MinFOp>,
-               ArithGenericPattern<arith::MinSIOp>,
-               ArithGenericPattern<arith::MinUIOp>,
-               // Floating point
-               ArithGenericPattern<arith::MulFOp>,
-               ArithGenericPattern<arith::DivFOp>,
-               ArithGenericPattern<arith::RemFOp>,
-               // Cmp
-               ArithCmpPattern<arith::CmpIOp, triton::gpu::CmpIOp>,
-               ArithCmpPattern<arith::CmpFOp, triton::gpu::CmpFOp>,
-               // Cast Ops
-               ArithGenericPattern<arith::TruncIOp>,
-               ArithGenericPattern<arith::TruncFOp>
-              >(typeConverter, context);
+  patterns.add<
+      ArithConstantPattern, ArithGenericPattern<arith::AddIOp>,
+      ArithGenericPattern<arith::SubIOp>, ArithGenericPattern<arith::MulIOp>,
+      ArithGenericPattern<arith::DivUIOp>, ArithGenericPattern<arith::DivSIOp>,
+      ArithGenericPattern<arith::CeilDivUIOp>,
+      ArithGenericPattern<arith::CeilDivSIOp>,
+      ArithGenericPattern<arith::FloorDivSIOp>,
+      ArithGenericPattern<arith::RemUIOp>, ArithGenericPattern<arith::RemSIOp>,
+      ArithGenericPattern<arith::AndIOp>, ArithGenericPattern<arith::OrIOp>,
+      ArithGenericPattern<arith::XOrIOp>, ArithGenericPattern<arith::ShLIOp>,
+      ArithGenericPattern<arith::ShRUIOp>,
+      ArithGenericPattern<arith::ShRSIOp>, // NegFOp
+      // Floating point
+      ArithGenericPattern<arith::AddFOp>, ArithGenericPattern<arith::SubFOp>,
+      // MaxMin
+      ArithGenericPattern<arith::MaxFOp>, ArithGenericPattern<arith::MaxSIOp>,
+      ArithGenericPattern<arith::MaxUIOp>, ArithGenericPattern<arith::MinFOp>,
+      ArithGenericPattern<arith::MinSIOp>, ArithGenericPattern<arith::MinUIOp>,
+      // Floating point
+      ArithGenericPattern<arith::MulFOp>, ArithGenericPattern<arith::DivFOp>,
+      ArithGenericPattern<arith::RemFOp>,
+      // Cmp
+      ArithCmpPattern<arith::CmpIOp, triton::gpu::CmpIOp>,
+      ArithCmpPattern<arith::CmpFOp, triton::gpu::CmpFOp>,
+      // Cast Ops
+      ArithGenericPattern<arith::TruncIOp>,
+      ArithGenericPattern<arith::TruncFOp>>(typeConverter, context);
 }
 
 //
 // Triton patterns
 //
 // TODO: Do we need to put them in anonymous namespace?
-struct TritonMakeRangePattern : public OpConversionPattern<triton::MakeRangeOp> {
+struct TritonMakeRangePattern
+    : public OpConversionPattern<triton::MakeRangeOp> {
   using OpConversionPattern<triton::MakeRangeOp>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(triton::MakeRangeOp op, OpAdaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(triton::MakeRangeOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     Type retType = getTypeConverter()->convertType(op.getType());
     rewriter.replaceOpWithNewOp<triton::MakeRangeOp>(
-      op, retType, adaptor.start(), adaptor.end()
-    );
+        op, retType, adaptor.start(), adaptor.end());
     return success();
   }
 };
@@ -151,8 +145,9 @@ struct TritonMakeRangePattern : public OpConversionPattern<triton::MakeRangeOp>
 struct TritonDotPattern : public OpConversionPattern<triton::DotOp> {
   using OpConversionPattern<triton::DotOp>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(triton::DotOp op, OpAdaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(triton::DotOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     Type retType = getTypeConverter()->convertType(op.getType());
     // a & b must be of smem layout
     auto aType = adaptor.a().getType().cast<RankedTensorType>();
@@ -165,18 +160,21 @@ struct TritonDotPattern : public OpConversionPattern<triton::DotOp> {
     Value b = adaptor.b();
     SmallVector<unsigned, 2> order{1, 0};
     if (!aEncoding.isa<triton::gpu::TritonGPUSharedEncodingAttr>()) {
-      Attribute encoding = triton::gpu::TritonGPUSharedEncodingAttr::get(getContext(), 1, 1, 1, order);
-      auto dstType = RankedTensorType::get(aType.getShape(), aType.getElementType(), encoding);
+      Attribute encoding = triton::gpu::TritonGPUSharedEncodingAttr::get(
+          getContext(), 1, 1, 1, order);
+      auto dstType = RankedTensorType::get(aType.getShape(),
+                                           aType.getElementType(), encoding);
       a = rewriter.create<triton::gpu::ConvertLayoutOp>(a.getLoc(), dstType, a);
     }
     if (!bEncoding.isa<triton::gpu::TritonGPUSharedEncodingAttr>()) {
-      Attribute encoding = triton::gpu::TritonGPUSharedEncodingAttr::get(getContext(), 1, 1, 1, order);
-      auto dstType = RankedTensorType::get(bType.getShape(), bType.getElementType(), encoding);
+      Attribute encoding = triton::gpu::TritonGPUSharedEncodingAttr::get(
+          getContext(), 1, 1, 1, order);
+      auto dstType = RankedTensorType::get(bType.getShape(),
+                                           bType.getElementType(), encoding);
       b = rewriter.create<triton::gpu::ConvertLayoutOp>(b.getLoc(), dstType, b);
     }
     auto newDot = rewriter.replaceOpWithNewOp<triton::DotOp>(
-      op, retType, a, b, adaptor.c(), adaptor.allowTF32()
-    );
+        op, retType, a, b, adaptor.c(), adaptor.allowTF32());
     return success();
   }
 };
@@ -184,14 +182,13 @@ struct TritonDotPattern : public OpConversionPattern<triton::DotOp> {
 struct TritonLoadPattern : public OpConversionPattern<triton::LoadOp> {
   using OpConversionPattern<triton::LoadOp>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(triton::LoadOp op, OpAdaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(triton::LoadOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     Type retType = getTypeConverter()->convertType(op.getType());
     rewriter.replaceOpWithNewOp<triton::LoadOp>(
-      op, retType,
-      adaptor.ptr(), adaptor.mask(), adaptor.other(),
-      adaptor.cache(), adaptor.evict(), adaptor.isVolatile()
-    );
+        op, retType, adaptor.ptr(), adaptor.mask(), adaptor.other(),
+        adaptor.cache(), adaptor.evict(), adaptor.isVolatile());
     return success();
   }
 };
@@ -199,11 +196,11 @@ struct TritonLoadPattern : public OpConversionPattern<triton::LoadOp> {
 struct TritonStorePattern : public OpConversionPattern<triton::StoreOp> {
   using OpConversionPattern<triton::StoreOp>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(triton::StoreOp op, OpAdaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(triton::StoreOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     auto newOp = rewriter.replaceOpWithNewOp<triton::StoreOp>(
-      op, adaptor.ptr(), adaptor.value(), adaptor.mask()
-    );
+        op, adaptor.ptr(), adaptor.value(), adaptor.mask());
     return success();
   }
 };
@@ -212,12 +209,11 @@ template <class Op>
 struct TritonGenericPattern : public OpConversionPattern<Op> {
   using OpConversionPattern<Op>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(Op op, typename Op::Adaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(Op op, typename Op::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     Type retType = this->getTypeConverter()->convertType(op.getType());
-    rewriter.replaceOpWithNewOp<Op>(
-      op, retType, adaptor.getOperands()
-    );
+    rewriter.replaceOpWithNewOp<Op>(op, retType, adaptor.getOperands());
     return success();
   }
 };
@@ -225,30 +221,25 @@ struct TritonGenericPattern : public OpConversionPattern<Op> {
 struct TritonReducePattern : public OpConversionPattern<triton::ReduceOp> {
   using OpConversionPattern<triton::ReduceOp>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(triton::ReduceOp op, OpAdaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(triton::ReduceOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     Type retType = this->getTypeConverter()->convertType(op.getType());
     auto newOp = rewriter.replaceOpWithNewOp<triton::ReduceOp>(
-      op, retType, adaptor.redOp(), adaptor.operand(), adaptor.axis()
-    );
+        op, retType, adaptor.redOp(), adaptor.operand(), adaptor.axis());
     return success();
   }
 };
 
-void populateTritonPatterns(
-  TritonGPUTypeConverter& typeConverter, RewritePatternSet &patterns
-) {
+void populateTritonPatterns(TritonGPUTypeConverter &typeConverter,
+                            RewritePatternSet &patterns) {
   MLIRContext *context = patterns.getContext();
   patterns.add<TritonGenericPattern<triton::ReshapeOp>,
                TritonGenericPattern<triton::SplatOp>,
                TritonGenericPattern<triton::BroadcastOp>,
-               TritonGenericPattern<triton::GEPOp>,
-               TritonReducePattern,
-               TritonMakeRangePattern,
-               TritonDotPattern,
-               TritonLoadPattern,
-               TritonStorePattern
-              >(typeConverter, context);
+               TritonGenericPattern<triton::GEPOp>, TritonReducePattern,
+               TritonMakeRangePattern, TritonDotPattern, TritonLoadPattern,
+               TritonStorePattern>(typeConverter, context);
 }
 
 //
@@ -259,17 +250,19 @@ void populateTritonPatterns(
 struct SCFForPattern : public OpConversionPattern<scf::ForOp> {
   using OpConversionPattern<scf::ForOp>::OpConversionPattern;
   // Ref: ConvertForOpTypes
-  LogicalResult matchAndRewrite(scf::ForOp op, OpAdaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
-    auto newOp = cast<scf::ForOp>(rewriter.cloneWithoutRegions(*op.getOperation()));
+  LogicalResult
+  matchAndRewrite(scf::ForOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto newOp =
+        cast<scf::ForOp>(rewriter.cloneWithoutRegions(*op.getOperation()));
     rewriter.inlineRegionBefore(op.getLoopBody(), newOp.getLoopBody(),
                                 newOp.getLoopBody().end());
 
     // Now, update all the types.
 
     // Convert the types of block arguments within the given region. This
-    // replaces each block with a new block containing the updated signature. The
-    // entry block may have a special conversion if `entryConversion` is
+    // replaces each block with a new block containing the updated signature.
+    // The entry block may have a special conversion if `entryConversion` is
     // provided. On success, the new entry block to the region is returned for
     // convenience. Otherwise, failure is returned.
     if (failed(rewriter.convertRegionTypes(&newOp.getLoopBody(),
@@ -299,33 +292,27 @@ struct SCFForPattern : public OpConversionPattern<scf::ForOp> {
 struct SCFYieldPattern : public OpConversionPattern<scf::YieldOp> {
   using OpConversionPattern<scf::YieldOp>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(scf::YieldOp op, OpAdaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(scf::YieldOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     // rewriter.setInsertionPointToEnd(rewriter.getInsertionBlock());
     // rewriter.create<scf::YieldOp>(op.getLoc(), adaptor.getOperands());
     // op.erase();
-    rewriter.replaceOpWithNewOp<scf::YieldOp>(
-      op, adaptor.getOperands()
-    );
+    rewriter.replaceOpWithNewOp<scf::YieldOp>(op, adaptor.getOperands());
     return success();
   }
 };
 
-void populateSCFPatterns(
-  TritonGPUTypeConverter &typeConverter, RewritePatternSet &patterns
-) {
+void populateSCFPatterns(TritonGPUTypeConverter &typeConverter,
+                         RewritePatternSet &patterns) {
   MLIRContext *context = patterns.getContext();
-  patterns.add<SCFYieldPattern, SCFForPattern
-              >(typeConverter, context);
+  patterns.add<SCFYieldPattern, SCFForPattern>(typeConverter, context);
 }
 
-
-class ConvertTritonToTritonGPU :
-    public ConvertTritonToTritonGPUBase<ConvertTritonToTritonGPU> {
+class ConvertTritonToTritonGPU
+    : public ConvertTritonToTritonGPUBase<ConvertTritonToTritonGPU> {
 public:
-  ConvertTritonToTritonGPU(int numWarps) {
-    this->numWarps = numWarps;
-  }
+  ConvertTritonToTritonGPU(int numWarps) { this->numWarps = numWarps; }
 
   void runOnOperation() override {
     MLIRContext *context = &getContext();
@@ -339,21 +326,21 @@ public:
     // add rules
     populateArithmeticPatternsAndLegality(typeConverter, patterns, target);
     populateTritonPatterns(typeConverter, patterns);
-    // TODO: can we use 
+    // TODO: can we use
     //    mlir::scf::populateSCFStructurealTypeConversionsAndLegality(...) here?
     populateSCFPatterns(typeConverter, patterns);
 
-    if(failed(applyPartialConversion(mod, target, std::move(patterns))))
-        return signalPassFailure();
+    if (failed(applyPartialConversion(mod, target, std::move(patterns))))
+      return signalPassFailure();
 
     // update layouts
     //  broadcast src => multicast, dst => broadcasted
-    if(failed(target.refineLayouts(mod, numWarps)))
+    if (failed(target.refineLayouts(mod, numWarps)))
       return signalPassFailure();
   }
 };
 
-}
+} // namespace
 
 std::unique_ptr<OperationPass<ModuleOp>>
 mlir::triton::createConvertTritonToTritonGPUPass(int numWarps) {
diff --git a/lib/Dialect/Triton/IR/Dialect.cpp b/lib/Dialect/Triton/IR/Dialect.cpp
index ff7ce0436..4b286e5b8 100644
--- a/lib/Dialect/Triton/IR/Dialect.cpp
+++ b/lib/Dialect/Triton/IR/Dialect.cpp
@@ -7,7 +7,6 @@
 
 #include "mlir/IR/DialectImplementation.h"
 
-
 #include "triton/Dialect/Triton/IR/Dialect.cpp.inc"
 
 using namespace mlir;
@@ -19,12 +18,13 @@ void TritonDialect::initialize() {
   addOperations<
 #define GET_OP_LIST
 #include "triton/Dialect/Triton/IR/Ops.cpp.inc"
-               >();
+      >();
 
   // We can also add interface here.
 }
 
-Operation *TritonDialect::materializeConstant(OpBuilder &builder, Attribute value,
-                                          Type type, Location loc) {
+Operation *TritonDialect::materializeConstant(OpBuilder &builder,
+                                              Attribute value, Type type,
+                                              Location loc) {
   return builder.create<arith::ConstantOp>(loc, type, value);
 }
\ No newline at end of file
diff --git a/lib/Dialect/Triton/IR/Ops.cpp b/lib/Dialect/Triton/IR/Ops.cpp
index fd911b7a3..3d9204183 100644
--- a/lib/Dialect/Triton/IR/Ops.cpp
+++ b/lib/Dialect/Triton/IR/Ops.cpp
@@ -13,14 +13,16 @@ namespace triton {
 static Type getI1SameShape(Type type) {
   auto i1Type = IntegerType::get(type.getContext(), 1);
   if (auto tensorType = type.dyn_cast<RankedTensorType>())
-    return RankedTensorType::get(tensorType.getShape(), i1Type, tensorType.getEncoding());
+    return RankedTensorType::get(tensorType.getShape(), i1Type,
+                                 tensorType.getEncoding());
   return Type();
 }
 
 static Type getI32SameShape(Type type) {
   auto i32Type = IntegerType::get(type.getContext(), 32);
   if (auto tensorType = type.dyn_cast<RankedTensorType>())
-    return RankedTensorType::get(tensorType.getShape(), i32Type, tensorType.getEncoding());
+    return RankedTensorType::get(tensorType.getShape(), i32Type,
+                                 tensorType.getEncoding());
   return Type();
 }
 
@@ -34,8 +36,8 @@ static Type getPointerTypeFromTensor(Type type) {
   return Type();
 }
 
-}
-}
+} // namespace triton
+} // namespace mlir
 
 #define GET_OP_CLASSES
 #include "triton/Dialect/Triton/IR/Ops.cpp.inc"
@@ -48,50 +50,48 @@ namespace triton {
 
 //-- StoreOp --
 // Default mask
-void StoreOp::build(::mlir::OpBuilder &builder, ::mlir::OperationState &state, ::mlir::Value ptr, ::mlir::Value value) {
+void StoreOp::build(::mlir::OpBuilder &builder, ::mlir::OperationState &state,
+                    ::mlir::Value ptr, ::mlir::Value value) {
   TensorType ptrType = ptr.getType().dyn_cast<TensorType>();
   auto shape = ptrType.getShape();
   ::mlir::Value mask = builder.create<arith::ConstantOp>(
-    ptr.getLoc(),
-    RankedTensorType::get(shape, builder.getI1Type()),
-    DenseIntElementsAttr::get(
-      RankedTensorType::get(shape, builder.getI1Type()), true
-    )
-  );
+      ptr.getLoc(), RankedTensorType::get(shape, builder.getI1Type()),
+      DenseIntElementsAttr::get(
+          RankedTensorType::get(shape, builder.getI1Type()), true));
   state.addOperands(ptr);
   state.addOperands(value);
   state.addOperands(mask);
 }
 
 //-- LoadOp --
-void LoadOp::build(::mlir::OpBuilder &builder, ::mlir::OperationState &state, ::mlir::Value ptr,
-                   ::mlir::triton::CacheModifier cache, ::mlir::triton::EvictionPolicy evict, bool isVolatile) {
+void LoadOp::build(::mlir::OpBuilder &builder, ::mlir::OperationState &state,
+                   ::mlir::Value ptr, ::mlir::triton::CacheModifier cache,
+                   ::mlir::triton::EvictionPolicy evict, bool isVolatile) {
   TensorType ptrType = ptr.getType().dyn_cast<TensorType>();
-  Type elementType = ptrType.getElementType().dyn_cast<PointerType>().getPointeeType();
+  Type elementType =
+      ptrType.getElementType().dyn_cast<PointerType>().getPointeeType();
   auto shape = ptrType.getShape();
   // mask
   ::mlir::Value mask = builder.create<arith::ConstantOp>(
-    ptr.getLoc(),
-    RankedTensorType::get(shape, builder.getI1Type()),
-    DenseIntElementsAttr::get(
-      RankedTensorType::get(shape, builder.getI1Type()), true
-    )
-  );
+      ptr.getLoc(), RankedTensorType::get(shape, builder.getI1Type()),
+      DenseIntElementsAttr::get(
+          RankedTensorType::get(shape, builder.getI1Type()), true));
   // other
   Type resultType = RankedTensorType::get(shape, elementType);
   ::mlir::Value other = builder.create<arith::ConstantOp>(
-    ptr.getLoc(),
-    resultType,
-    DenseElementsAttr::get(
-      resultType, builder.getZeroAttr(elementType)
-    )
-  );
+      ptr.getLoc(), resultType,
+      DenseElementsAttr::get(resultType, builder.getZeroAttr(elementType)));
   state.addOperands(ptr);
   state.addOperands(mask);
   state.addOperands(other);
-  state.addAttribute(cacheAttrName(state.name), ::mlir::triton::CacheModifierAttr::get(builder.getContext(), cache));
-  state.addAttribute(evictAttrName(state.name), ::mlir::triton::EvictionPolicyAttr::get(builder.getContext(), evict));
-  state.addAttribute(isVolatileAttrName(state.name), builder.getBoolAttr(isVolatile));
+  state.addAttribute(
+      cacheAttrName(state.name),
+      ::mlir::triton::CacheModifierAttr::get(builder.getContext(), cache));
+  state.addAttribute(
+      evictAttrName(state.name),
+      ::mlir::triton::EvictionPolicyAttr::get(builder.getContext(), evict));
+  state.addAttribute(isVolatileAttrName(state.name),
+                     builder.getBoolAttr(isVolatile));
   state.addTypes({resultType});
 }
 
diff --git a/lib/Dialect/Triton/IR/Types.cpp b/lib/Dialect/Triton/IR/Types.cpp
index 66e8c7b05..5884a2ec4 100644
--- a/lib/Dialect/Triton/IR/Types.cpp
+++ b/lib/Dialect/Triton/IR/Types.cpp
@@ -1,6 +1,6 @@
-#include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 #include "mlir/IR/DialectImplementation.h" // required by `Types.cpp.inc`
+#include "triton/Dialect/Triton/IR/Dialect.h"
 #include "llvm/ADT/TypeSwitch.h" // required by `Types.cpp.inc`
 
 using namespace mlir;
@@ -16,7 +16,7 @@ void TritonDialect::registerTypes() {
   addTypes<
 #define GET_TYPEDEF_LIST
 #include "triton/Dialect/Triton/IR/Types.cpp.inc"
-  >();
+      >();
 }
 
 Type PointerType::parse(AsmParser &parser) {
diff --git a/lib/Dialect/Triton/Transforms/Combine.cpp b/lib/Dialect/Triton/Transforms/Combine.cpp
index 2fc073c05..ca5841aad 100644
--- a/lib/Dialect/Triton/Transforms/Combine.cpp
+++ b/lib/Dialect/Triton/Transforms/Combine.cpp
@@ -17,21 +17,23 @@ namespace {
 class CombineDotOp : public mlir::RewritePattern {
 public:
   CombineDotOp(mlir::MLIRContext *context)
-    : mlir::RewritePattern(mlir::RewritePattern::MatchAnyOpTypeTag(), 1, context) {}
-  mlir::LogicalResult matchAndRewrite(mlir::Operation *op,
-                                      mlir::PatternRewriter &rewriter) const override {
+      : mlir::RewritePattern(mlir::RewritePattern::MatchAnyOpTypeTag(), 1,
+                             context) {}
+  mlir::LogicalResult
+  matchAndRewrite(mlir::Operation *op,
+                  mlir::PatternRewriter &rewriter) const override {
     if (llvm::isa<mlir::arith::AddIOp, mlir::arith::AddFOp>(op)) {
       if (isCandidate(op->getOperand(0)).succeeded()) {
         auto dotOp = op->getOperand(0).getDefiningOp<mlir::triton::DotOp>();
         rewriter.replaceOpWithNewOp<mlir::triton::DotOp>(
-          op, dotOp->getResultTypes().front(), dotOp.a(),
-          dotOp.b(), op->getOperand(1), dotOp.allowTF32());
+            op, dotOp->getResultTypes().front(), dotOp.a(), dotOp.b(),
+            op->getOperand(1), dotOp.allowTF32());
         return mlir::success();
       } else if (isCandidate(op->getOperand(1)).succeeded()) {
         auto dotOp = op->getOperand(1).getDefiningOp<mlir::triton::DotOp>();
         rewriter.replaceOpWithNewOp<mlir::triton::DotOp>(
-          op, dotOp->getResultTypes().front(), dotOp.a(),
-          dotOp.b(), op->getOperand(0), dotOp.allowTF32());
+            op, dotOp->getResultTypes().front(), dotOp.a(), dotOp.b(),
+            op->getOperand(0), dotOp.allowTF32());
         return mlir::success();
       }
     }
@@ -54,7 +56,7 @@ private:
       return true;
     // broadcast(constant_0)
     if (auto bc = val.getDefiningOp<mlir::triton::BroadcastOp>()) {
-      if (mlir::matchPattern(bc.src(), mlir::m_Zero()) || 
+      if (mlir::matchPattern(bc.src(), mlir::m_Zero()) ||
           mlir::matchPattern(bc.src(), mlir::m_AnyZeroFloat()))
         return true;
     }
@@ -68,18 +70,19 @@ private:
 class CombineGEPOp : public mlir::RewritePattern {
 public:
   CombineGEPOp(mlir::MLIRContext *context)
-    : mlir::RewritePattern(mlir::RewritePattern::MatchAnyOpTypeTag(), 1, context) {}
+      : mlir::RewritePattern(mlir::RewritePattern::MatchAnyOpTypeTag(), 1,
+                             context) {}
 
-  mlir::LogicalResult matchAndRewrite(mlir::Operation *op,
-                                      mlir::PatternRewriter &rewriter) const override {
+  mlir::LogicalResult
+  matchAndRewrite(mlir::Operation *op,
+                  mlir::PatternRewriter &rewriter) const override {
     if (llvm::isa<mlir::triton::GEPOp>(op)) {
       if (auto gep2 = op->getOperand(0).getDefiningOp<mlir::triton::GEPOp>()) {
         auto loc = op->getLoc();
         mlir::Value newIdx = rewriter.create<mlir::arith::AddIOp>(
-          loc, op->getOperand(1), gep2->getOperand(1));
+            loc, op->getOperand(1), gep2->getOperand(1));
         rewriter.replaceOpWithNewOp<mlir::triton::GEPOp>(
-          op, op->getResultTypes().front(), gep2->getOperand(0), newIdx
-        );
+            op, op->getResultTypes().front(), gep2->getOperand(0), newIdx);
         return mlir::success();
       }
     }
@@ -92,20 +95,21 @@ public:
 class CombineSelectMaskedLoadOp : public mlir::RewritePattern {
 public:
   CombineSelectMaskedLoadOp(mlir::MLIRContext *context)
-    : mlir::RewritePattern(mlir::RewritePattern::MatchAnyOpTypeTag(), 1, context) {}
+      : mlir::RewritePattern(mlir::RewritePattern::MatchAnyOpTypeTag(), 1,
+                             context) {}
 
-  mlir::LogicalResult matchAndRewrite(mlir::Operation *op,
-                                      mlir::PatternRewriter &rewriter) const override {
+  mlir::LogicalResult
+  matchAndRewrite(mlir::Operation *op,
+                  mlir::PatternRewriter &rewriter) const override {
     if (llvm::isa<mlir::SelectOp>(op)) {
       if (auto load = op->getOperand(1).getDefiningOp<mlir::triton::LoadOp>()) {
         mlir::Value cond = op->getOperand(0);
         if (auto bc = load.mask().getDefiningOp<mlir::triton::BroadcastOp>()) {
           if (bc.src().getDefiningOp() == cond.getDefiningOp()) {
             rewriter.replaceOpWithNewOp<mlir::triton::LoadOp>(
-              op, op->getResultTypes().front(),
-              load.ptr(), load.mask(), op->getOperand(2),
-              load.cache(), load.evict(), load.isVolatile()
-            );
+                op, op->getResultTypes().front(), load.ptr(), load.mask(),
+                op->getOperand(2), load.cache(), load.evict(),
+                load.isVolatile());
             return mlir::success();
           }
         }
@@ -120,11 +124,11 @@ public:
 class CombineBroadcastConstantOp : public mlir::RewritePattern {
 public:
   CombineBroadcastConstantOp(mlir::MLIRContext *context)
-    : mlir::RewritePattern(mlir::RewritePattern::MatchAnyOpTypeTag(), 1,
-                           context) {}
-  
+      : mlir::RewritePattern(mlir::RewritePattern::MatchAnyOpTypeTag(), 1,
+                             context) {}
+
   LogicalResult matchAndRewrite(Operation *op,
-                                 PatternRewriter &rewriter) const override {
+                                PatternRewriter &rewriter) const override {
     if (auto broadcast = llvm::dyn_cast<triton::BroadcastOp>(op)) {
       if (auto cst = broadcast.src().getDefiningOp<arith::ConstantOp>()) {
         Attribute value = cst.getValue();
@@ -132,15 +136,14 @@ public:
         if (auto denseValue = value.dyn_cast<DenseElementsAttr>()) {
           if (!denseValue.isSplat())
             return failure();
-          value = DenseElementsAttr::get(resType, denseValue.getSplatValue<Attribute>());
+          value = DenseElementsAttr::get(resType,
+                                         denseValue.getSplatValue<Attribute>());
         } else {
           if (!value.isa<FloatAttr, IntegerAttr>())
             return failure();
           value = DenseElementsAttr::get(resType, value);
         }
-        rewriter.replaceOpWithNewOp<arith::ConstantOp>(
-          op, value, resType
-        );
+        rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, value, resType);
         return success();
       }
     }
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index d66a08892..127f8366e 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -11,19 +11,18 @@ using namespace mlir::triton::gpu;
 // parse an array of integers
 static LogicalResult parseIntArrayAttr(AsmParser &parser,
                                        const NamedAttribute &attr,
-                                       /*SmallVector<unsigned, 2>*/auto &res,
-                                       StringRef desc)  {
+                                       /*SmallVector<unsigned, 2>*/ auto &res,
+                                       StringRef desc) {
   auto arrayAttr = attr.getValue().dyn_cast<ArrayAttr>();
   if (!arrayAttr) {
-    parser.emitError(parser.getNameLoc(), "expected an array for ")
-            << desc;
+    parser.emitError(parser.getNameLoc(), "expected an array for ") << desc;
     return failure();
   }
   for (Attribute i : arrayAttr) {
     auto intAttr = i.dyn_cast<IntegerAttr>();
     if (!intAttr) {
       parser.emitError(parser.getNameLoc(), "expected an integer value in ")
-              << desc;
+          << desc;
       return failure();
     }
     res.push_back(intAttr.getUInt());
@@ -46,7 +45,7 @@ static Attribute parseBlocked(AsmParser &parser, Type type) {
     return {};
   if (parser.parseGreater().failed())
     return {};
-  
+
   SmallVector<unsigned, 2> threadTileSize;
   SmallVector<unsigned, 2> warpTileSize;
   SmallVector<unsigned, 2> blockTileSize;
@@ -55,19 +54,23 @@ static Attribute parseBlocked(AsmParser &parser, Type type) {
 
   for (const NamedAttribute &attr : dict) {
     if (attr.getName() == "threadTileSize") {
-      if (parseIntArrayAttr(parser, attr, threadTileSize, "thread tile size").failed())
+      if (parseIntArrayAttr(parser, attr, threadTileSize, "thread tile size")
+              .failed())
         return {};
     } else if (attr.getName() == "warpTileSize") {
-      if (parseIntArrayAttr(parser, attr, warpTileSize, "warp tile size").failed())
+      if (parseIntArrayAttr(parser, attr, warpTileSize, "warp tile size")
+              .failed())
         return {};
     } else if (attr.getName() == "blockTileSize") {
-      if (parseIntArrayAttr(parser, attr, blockTileSize, "block tile size").failed())
+      if (parseIntArrayAttr(parser, attr, blockTileSize, "block tile size")
+              .failed())
         return {};
     } else if (attr.getName() == "order") {
       if (parseIntArrayAttr(parser, attr, order, "order").failed())
         return {};
     } else if (attr.getName() == "broadcastAxis") {
-      if (parseIntArrayAttr(parser, attr, broadcastAxis, "broadcastAxis").failed())
+      if (parseIntArrayAttr(parser, attr, broadcastAxis, "broadcastAxis")
+              .failed())
         return {};
     } else {
       parser.emitError(parser.getNameLoc(), "unexpected key: ")
@@ -76,12 +79,9 @@ static Attribute parseBlocked(AsmParser &parser, Type type) {
     }
   }
 
-  return parser.getChecked<TritonGPUBlockedEncodingAttr>(parser.getContext(),
-                                                         threadTileSize,
-                                                         warpTileSize,
-                                                         blockTileSize,
-                                                         order,
-                                                         broadcastAxis);
+  return parser.getChecked<TritonGPUBlockedEncodingAttr>(
+      parser.getContext(), threadTileSize, warpTileSize, blockTileSize, order,
+      broadcastAxis);
 }
 
 static void printBlocked(AsmPrinter &printer, auto *attr) {
@@ -94,8 +94,7 @@ static void printBlocked(AsmPrinter &printer, auto *attr) {
           << "}>";
 }
 
-Attribute 
-TritonGPUBlockedEncodingAttr::parse(AsmParser &parser, Type type) {
+Attribute TritonGPUBlockedEncodingAttr::parse(AsmParser &parser, Type type) {
   parseBlocked(parser, type);
 }
 
@@ -103,8 +102,8 @@ void TritonGPUBlockedEncodingAttr::print(mlir::AsmPrinter &printer) const {
   printBlocked(printer, this);
 }
 
-Attribute
-TritonGPUBlockedMulticastEncodingAttr::parse(AsmParser &parser, Type type) {
+Attribute TritonGPUBlockedMulticastEncodingAttr::parse(AsmParser &parser,
+                                                       Type type) {
   parseBlocked(parser, type);
 }
 
@@ -131,38 +130,37 @@ static Attribute parseMma(AsmParser &parser, Type type) {
 
   for (const NamedAttribute &attr : dict) {
     if (attr.getName() == "fragmentPerWarp") {
-      if (parseIntArrayAttr(parser, attr, fragmentPerWarp, "fragmentPerWarp").failed())
+      if (parseIntArrayAttr(parser, attr, fragmentPerWarp, "fragmentPerWarp")
+              .failed())
         return {};
     } else if (attr.getName() == "shapePerWarp") {
-      if (parseIntArrayAttr(parser, attr, shapePerWarp, "shapePerWarp").failed())
+      if (parseIntArrayAttr(parser, attr, shapePerWarp, "shapePerWarp")
+              .failed())
         return {};
     } else if (attr.getName() == "warpPerTile") {
       if (parseIntArrayAttr(parser, attr, warpPerTile, "warpPerTile").failed())
         return {};
     } else if (attr.getName() == "shapePerTile") {
-      if (parseIntArrayAttr(parser, attr, shapePerTile, "shapePerTile").failed())
+      if (parseIntArrayAttr(parser, attr, shapePerTile, "shapePerTile")
+              .failed())
         return {};
     } else if (attr.getName() == "repetitions") {
       if (parseIntArrayAttr(parser, attr, repetitions, "repetitions").failed())
         return {};
     } else if (attr.getName() == "contigPerThread") {
-      if (parseIntArrayAttr(parser, attr, contigPerThread, "contigPerThread").failed())
+      if (parseIntArrayAttr(parser, attr, contigPerThread, "contigPerThread")
+              .failed())
         return {};
     } else {
       parser.emitError(parser.getNameLoc(), "unexpected key: ")
-             << attr.getName().strref();
+          << attr.getName().strref();
       return {};
     }
   }
 
-  return parser.getChecked<TritonGPUMmaEncodingAttr>(parser.getContext(),
-                                                     fragmentPerWarp,
-                                                     shapePerWarp,
-                                                     warpPerTile,
-                                                     shapePerTile,
-                                                     repetitions,
-                                                     contigPerThread,
-                                                     broadcastAxis);
+  return parser.getChecked<TritonGPUMmaEncodingAttr>(
+      parser.getContext(), fragmentPerWarp, shapePerWarp, warpPerTile,
+      shapePerTile, repetitions, contigPerThread, broadcastAxis);
 }
 
 static void printMma(AsmPrinter &printer, auto *attr) {
@@ -176,8 +174,7 @@ static void printMma(AsmPrinter &printer, auto *attr) {
           << "}>";
 }
 
-Attribute 
-TritonGPUMmaEncodingAttr::parse(AsmParser &parser, Type type) {
+Attribute TritonGPUMmaEncodingAttr::parse(AsmParser &parser, Type type) {
   return parseMma(parser, type);
 }
 
@@ -185,8 +182,8 @@ void TritonGPUMmaEncodingAttr::print(AsmPrinter &printer) const {
   printMma(printer, this);
 }
 
-Attribute
-TritonGPUMmaMulticastEncodingAttr::parse(AsmParser &parser, Type type) {
+Attribute TritonGPUMmaMulticastEncodingAttr::parse(AsmParser &parser,
+                                                   Type type) {
   return parseMma(parser, type);
 }
 
@@ -194,8 +191,7 @@ void TritonGPUMmaMulticastEncodingAttr::print(AsmPrinter &printer) const {
   printMma(printer, this);
 }
 
-Attribute
-TritonGPUSharedEncodingAttr::parse(AsmParser &parser, Type type) {
+Attribute TritonGPUSharedEncodingAttr::parse(AsmParser &parser, Type type) {
   if (parser.parseLess().failed())
     return {};
   // Parse the data as a dictionary
@@ -210,8 +206,7 @@ TritonGPUSharedEncodingAttr::parse(AsmParser &parser, Type type) {
   unsigned maxPhase = 0;
   SmallVector<unsigned, 2> order;
 
-  auto parseUInt = [&parser](const NamedAttribute &attr,
-                             unsigned &value,
+  auto parseUInt = [&parser](const NamedAttribute &attr, unsigned &value,
                              StringRef desc) -> LogicalResult {
     auto intAttr = attr.getValue().dyn_cast<IntegerAttr>();
     if (!intAttr) {
@@ -237,29 +232,25 @@ TritonGPUSharedEncodingAttr::parse(AsmParser &parser, Type type) {
         return {};
     } else {
       parser.emitError(parser.getNameLoc(), "unexpected key: ")
-             << attr.getName().strref();
+          << attr.getName().strref();
       return {};
     }
   }
 
-  return parser.getChecked<TritonGPUSharedEncodingAttr>(parser.getContext(),
-                                                        vec,
-                                                        perPhase,
-                                                        maxPhase,
-                                                        order);
+  return parser.getChecked<TritonGPUSharedEncodingAttr>(
+      parser.getContext(), vec, perPhase, maxPhase, order);
 }
 
 void TritonGPUSharedEncodingAttr::print(AsmPrinter &printer) const {
   printer << "<{"
-          << "vec = " << getVec()
-          << ", perPhase = " << getPerPhase()
-          << ", maxPhase = " << getMaxPhase()
-          << ", order = [" << getOrder() << "]"
+          << "vec = " << getVec() << ", perPhase = " << getPerPhase()
+          << ", maxPhase = " << getMaxPhase() << ", order = [" << getOrder()
+          << "]"
           << "}>";
 }
 
 class TritonGPUOpAsmInterface : public OpAsmDialectInterface {
- public:
+public:
   using OpAsmDialectInterface::OpAsmDialectInterface;
 
   AliasResult getAlias(Attribute attr, raw_ostream &os) const override {
@@ -289,7 +280,7 @@ class TritonGPUOpAsmInterface : public OpAsmDialectInterface {
     OpAsmDialectInterface::getAlias(attr, os);
   }
 
- private:
+private:
   static void printMma(const auto &attr, raw_ostream &os) {
     TritonGPUOpAsmInterface::printArray(attr.getFragmentPerWarp(), os);
     TritonGPUOpAsmInterface::printArray(attr.getShapePerWarp(), os);
@@ -338,7 +329,7 @@ void TritonGPUDialect::initialize() {
   addOperations<
 #define GET_OP_LIST
 #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc"
-  >();
+      >();
   addInterfaces<TritonGPUOpAsmInterface>();
 }
 
@@ -349,7 +340,8 @@ namespace triton {
 static Type getI1SameShape(Type type) {
   auto i1Type = IntegerType::get(type.getContext(), 1);
   if (auto tensorType = type.dyn_cast<RankedTensorType>())
-    return RankedTensorType::get(tensorType.getShape(), i1Type, tensorType.getEncoding());
+    return RankedTensorType::get(tensorType.getShape(), i1Type,
+                                 tensorType.getEncoding());
   return Type();
 }
 
@@ -368,8 +360,8 @@ static Type getPointeeType(Type type) {
   return Type();
 }
 
-}
-}
+} // namespace triton
+} // namespace mlir
 
 static LogicalResult verify(CopyAsyncOp op) {
   Type resType = op.getResult().getType();
@@ -385,11 +377,9 @@ static LogicalResult verify(CopyAsyncOp op) {
 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc"
 
-
 // verify TritonGPU ops
-LogicalResult
-TritonGPUDialect::verifyOperationAttribute(Operation *op,
-                                           NamedAttribute attr) {
+LogicalResult TritonGPUDialect::verifyOperationAttribute(Operation *op,
+                                                         NamedAttribute attr) {
   // TODO: fill this.
   return success();
 }
diff --git a/lib/Dialect/TritonGPU/Transforms/Combine.cpp b/lib/Dialect/TritonGPU/Transforms/Combine.cpp
index 0052a3975..92b9127a3 100644
--- a/lib/Dialect/TritonGPU/Transforms/Combine.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Combine.cpp
@@ -27,8 +27,8 @@ namespace {
 #define GEN_PASS_CLASSES
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
 
-class TritonGPUCombineOpsPass 
-  : public TritonGPUCombineOpsBase<TritonGPUCombineOpsPass> {
+class TritonGPUCombineOpsPass
+    : public TritonGPUCombineOpsBase<TritonGPUCombineOpsPass> {
 public:
   void runOnOperation() override {
     MLIRContext *context = &getContext();
diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp
index b68276678..13e807921 100644
--- a/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp
@@ -6,12 +6,11 @@
 //===----------------------------------------------------------------------===//
 //
 // This file implements loop software pipelining
-// The implementation here is inspired by the pipeline pass in Triton (-v2.0) 
+// The implementation here is inspired by the pipeline pass in Triton (-v2.0)
 // and SCF's LoopPipelining.
 //
 //===----------------------------------------------------------------------===//
 
-
 using namespace mlir;
 
 #define GEN_PASS_CLASSES
@@ -41,14 +40,15 @@ class LoopPipeliner {
   /// Block arguments that loads depend on
   DenseSet<BlockArgument> depArgs;
   /// Operations (inside the loop body) that loads depend on
-  DenseSet<Operation*> depOps;
+  DenseSet<Operation *> depOps;
 
   /// collect values that v depends on and are defined inside the loop
   void collectDeps(Value v, int stages, DenseSet<Value> &deps);
 
   void setValueMapping(Value origin, Value newValue, int stage);
+
 public:
-  LoopPipeliner(scf::ForOp forOp, int numStages) 
+  LoopPipeliner(scf::ForOp forOp, int numStages)
       : forOp(forOp), numStages(numStages) {
     // cache yieldOp
     yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
@@ -86,7 +86,7 @@ void LoopPipeliner::collectDeps(Value v, int stages, DenseSet<Value> &deps) {
   if (auto arg = v.dyn_cast<BlockArgument>()) {
     deps.insert(v);
     // Note: we have iv as the first arg, so the op idx is arg.getArgNumber()-1
-    collectDeps(yieldOp->getOperand(arg.getArgNumber() - 1), stages-1, deps);
+    collectDeps(yieldOp->getOperand(arg.getArgNumber() - 1), stages - 1, deps);
   } else { // value
     // v might be in deps, but we still need to visit v.
     // This is because v might depends on value in previous iterations
@@ -123,8 +123,8 @@ LogicalResult LoopPipeliner::initialize() {
   }
 
   // for (triton::LoadOp loadOp : allLoads) {
-  //   llvm::errs() << loadOp << " depends on: #" << loadDeps[loadOp].size() << " values\n";
-  //   for (Value dep : loadDeps[loadOp])
+  //   llvm::errs() << loadOp << " depends on: #" << loadDeps[loadOp].size() <<
+  //   " values\n"; for (Value dep : loadDeps[loadOp])
   //     llvm::errs() << dep << "\n";
   //   llvm::errs() << "\n";
   // }
@@ -147,9 +147,13 @@ LogicalResult LoopPipeliner::initialize() {
     if (isCandiate && loadOp.getResult().hasOneUse()) {
       isCandiate = false;
       Operation *use = *loadOp.getResult().getUsers().begin();
-      if (auto convertLayout = llvm::dyn_cast<triton::gpu::ConvertLayoutOp>(use)) {
-        if (auto tensorType = convertLayout.getResult().getType().dyn_cast<RankedTensorType>()) {
-          if (tensorType.getEncoding().isa<triton::gpu::TritonGPUSharedEncodingAttr>()) {
+      if (auto convertLayout =
+              llvm::dyn_cast<triton::gpu::ConvertLayoutOp>(use)) {
+        if (auto tensorType = convertLayout.getResult()
+                                  .getType()
+                                  .dyn_cast<RankedTensorType>()) {
+          if (tensorType.getEncoding()
+                  .isa<triton::gpu::TritonGPUSharedEncodingAttr>()) {
             isCandiate = true;
             loadsMapping[loadOp] = convertLayout;
           }
@@ -162,7 +166,6 @@ LogicalResult LoopPipeliner::initialize() {
       loads.insert(loadOp);
   }
 
-
   // we have some loads to pipeline
   if (!loads.empty()) {
     // update depArgs & depOps
@@ -202,10 +205,10 @@ void LoopPipeliner::emitPrologue() {
 
     // special handling for loop condition as there is no condition in ForOp
     Value loopCond = builder.create<arith::CmpIOp>(
-      iv.getLoc(), arith::CmpIPredicate::slt, iv, forOp.getUpperBound());
+        iv.getLoc(), arith::CmpIPredicate::slt, iv, forOp.getUpperBound());
 
     // rematerialize peeled values
-    SmallVector<Operation*> orderedDeps;
+    SmallVector<Operation *> orderedDeps;
     for (Operation &op : forOp.getLoopBody().front()) {
       if (depOps.contains(&op))
         orderedDeps.push_back(&op);
@@ -221,10 +224,9 @@ void LoopPipeliner::emitPrologue() {
         // TODO: check if the hardware supports copyasync
         if (auto loadOp = llvm::dyn_cast<triton::LoadOp>(op)) {
           newOp = builder.create<triton::gpu::CopyAsyncOp>(
-            op->getLoc(), loadsMapping[loadOp].getType(),
-            loadOp.ptr(), loadOp.mask(), loadOp.other(),
-            loadOp.cache(), loadOp.evict(), loadOp.isVolatile()
-          );
+              op->getLoc(), loadsMapping[loadOp].getType(), loadOp.ptr(),
+              loadOp.mask(), loadOp.other(), loadOp.cache(), loadOp.evict(),
+              loadOp.isVolatile());
         } else
           llvm_unreachable("This should be LoadOp");
       } else
@@ -245,12 +247,10 @@ void LoopPipeliner::emitPrologue() {
         // assert(I1 or TensorOf<[I1]>);
         OpBuilder::InsertionGuard g(builder);
         builder.setInsertionPoint(newOp);
-        Value splatCond = builder.create<triton::BroadcastOp>(mask.getLoc(),
-                                                              mask.getType(),
-                                                              loopCond);
-        Value newMask = builder.create<arith::AndIOp>(mask.getLoc(),
-                                                      mask,
-                                                      splatCond);
+        Value splatCond = builder.create<triton::BroadcastOp>(
+            mask.getLoc(), mask.getType(), loopCond);
+        Value newMask =
+            builder.create<arith::AndIOp>(mask.getLoc(), mask, splatCond);
         newOp->setOperand(1, newMask);
       }
 
@@ -264,8 +264,9 @@ void LoopPipeliner::emitPrologue() {
         // update mapping for loop-carried values (args)
         for (OpOperand &operand : yieldOp->getOpOperands()) {
           if (operand.get() == op->getResult(dstIdx))
-            setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
-                            newOp->getResult(dstIdx), stage + 1);
+            setValueMapping(
+                forOp.getRegionIterArgs()[operand.getOperandNumber()],
+                newOp->getResult(dstIdx), stage + 1);
         }
       }
     }
@@ -296,21 +297,19 @@ scf::ForOp LoopPipeliner::createNewForOp() {
   size_t depArgsBeginIdx = newLoopArgs.size();
   for (BlockArgument depArg : depArgs) {
     depArgsIdx[depArg] = newLoopArgs.size();
-    newLoopArgs.push_back(valueMapping[depArg][numStages-1]);
+    newLoopArgs.push_back(valueMapping[depArg][numStages - 1]);
   }
 
   size_t nextIVIdx = newLoopArgs.size();
-  newLoopArgs.push_back(valueMapping[forOp.getInductionVar()][numStages-2]);
+  newLoopArgs.push_back(valueMapping[forOp.getInductionVar()][numStages - 2]);
 
   for (size_t i = 0; i < newLoopArgs.size(); ++i)
     assert(newLoopArgs[i]);
 
   // 1. signature of the new ForOp
-  auto newForOp = builder.create<scf::ForOp>(forOp.getLoc(),
-                                             forOp.getLowerBound(),
-                                             forOp.getUpperBound(),
-                                             forOp.getStep(),
-                                             newLoopArgs);
+  auto newForOp = builder.create<scf::ForOp>(
+      forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(),
+      forOp.getStep(), newLoopArgs);
 
   // 2. body of the new ForOp
   builder.setInsertionPointToStart(newForOp.getBody());
@@ -329,15 +328,15 @@ scf::ForOp LoopPipeliner::createNewForOp() {
   // 3. replace loads with block args (from prologue)
   for (size_t idx = 0; idx < loads.size(); ++idx) {
     Value load = loads[idx];
-    assert(load.hasOneUse() && "we assume that this load has one use (ConvertLayout)");
+    assert(load.hasOneUse() &&
+           "we assume that this load has one use (ConvertLayout)");
     Value loadUse = load.getUsers().begin()->getResult(0);
     mapping.lookup(loadUse).replaceAllUsesWith(
-      newForOp.getRegionIterArgs()[loadIdx + idx*(numStages-1)]);
+        newForOp.getRegionIterArgs()[loadIdx + idx * (numStages - 1)]);
   }
 
-
   // 4. prefetch the next iteration
-  SmallVector<Operation*> orderedDeps;
+  SmallVector<Operation *> orderedDeps;
   for (Operation &op : forOp.getLoopBody().front()) {
     if (depOps.contains(&op))
       orderedDeps.push_back(&op);
@@ -350,41 +349,39 @@ scf::ForOp LoopPipeliner::createNewForOp() {
   DenseMap<BlockArgument, Value> depArgsMapping;
   size_t argIdx = 0;
   for (BlockArgument arg : depArgs) {
-    nextMapping.map(arg, newForOp.getRegionIterArgs()[argIdx + depArgsBeginIdx]);
+    nextMapping.map(arg,
+                    newForOp.getRegionIterArgs()[argIdx + depArgsBeginIdx]);
     ++argIdx;
   }
   // special handling for iv & loop condition
-  Value nextIV = builder.create<arith::AddIOp>(newForOp.getInductionVar().getLoc(),
-                                               newForOp.getRegionIterArgs()[nextIVIdx],
-                                               newForOp.getStep());
-  Value nextLoopCond = builder.create<arith::CmpIOp>(
-    nextIV.getLoc(), arith::CmpIPredicate::slt,
-    nextIV, newForOp.getUpperBound());
+  Value nextIV = builder.create<arith::AddIOp>(
+      newForOp.getInductionVar().getLoc(),
+      newForOp.getRegionIterArgs()[nextIVIdx], newForOp.getStep());
+  Value nextLoopCond =
+      builder.create<arith::CmpIOp>(nextIV.getLoc(), arith::CmpIPredicate::slt,
+                                    nextIV, newForOp.getUpperBound());
   for (Operation *op : orderedDeps) {
     Operation *nextOp = nullptr;
     // update loading mask
     if (loads.contains(op->getResult(0))) {
       auto loadOp = llvm::cast<triton::LoadOp>(op);
       Value mask = loadOp.mask();
-      Value splatCond = builder.create<triton::BroadcastOp>(mask.getLoc(),
-                                                            mask.getType(),
-                                                            nextLoopCond);
-      Value newMask = builder.create<arith::AndIOp>(mask.getLoc(),
-                                                    splatCond,
-                                                    nextMapping.lookupOrDefault(mask));
-      // if mask is defined outside the loop, don't update the map more than once
+      Value splatCond = builder.create<triton::BroadcastOp>(
+          mask.getLoc(), mask.getType(), nextLoopCond);
+      Value newMask = builder.create<arith::AndIOp>(
+          mask.getLoc(), splatCond, nextMapping.lookupOrDefault(mask));
+      // if mask is defined outside the loop, don't update the map more than
+      // once
       if (!(forOp.isDefinedOutsideOfLoop(mask) && nextMapping.contains(mask)))
         nextMapping.map(mask, newMask);
       // TODO: more elegant way to do this?
       nextOp = builder.create<triton::gpu::CopyAsyncOp>(
-        op->getLoc(), loadsMapping[op->getResult(0)].getType(),
-        nextMapping.lookupOrDefault(loadOp.ptr()),
-        nextMapping.lookupOrDefault(loadOp.mask()),
-        nextMapping.lookupOrDefault(loadOp.other()),
-        loadOp.cache(), loadOp.evict(), loadOp.isVolatile()
-      );
-    }
-    else
+          op->getLoc(), loadsMapping[op->getResult(0)].getType(),
+          nextMapping.lookupOrDefault(loadOp.ptr()),
+          nextMapping.lookupOrDefault(loadOp.mask()),
+          nextMapping.lookupOrDefault(loadOp.other()), loadOp.cache(),
+          loadOp.evict(), loadOp.isVolatile());
+    } else
       nextOp = builder.clone(*op, nextMapping);
     // llvm::errs() << "epilogue cloning...: " << *op << "\n";
     // update mapping of results
@@ -411,15 +408,16 @@ scf::ForOp LoopPipeliner::createNewForOp() {
   for (size_t idx = 0; idx < loads.size(); ++idx) {
     Value load = loads[idx];
     for (int stage = 1; stage < numStages - 1; ++stage) {
-      yieldValues.push_back(newForOp.getRegionIterArgs()[
-        loadIdx + idx*(numStages-1) + stage
-      ]);
+      yieldValues.push_back(
+          newForOp
+              .getRegionIterArgs()[loadIdx + idx * (numStages - 1) + stage]);
     }
     yieldValues.push_back(nextMapping.lookup(load));
   }
 
   for (size_t i = depArgsBeginIdx; i < nextIVIdx; ++i)
-    yieldValues.push_back(depArgsMapping.lookup(newForOp.getRegionIterArgs()[i]));
+    yieldValues.push_back(
+        depArgsMapping.lookup(newForOp.getRegionIterArgs()[i]));
   yieldValues.push_back(nextIV);
   builder.setInsertionPointToEnd(newForOp.getBody());
   builder.create<scf::YieldOp>(forOp.getBody()->getTerminator()->getLoc(),
@@ -430,9 +428,7 @@ scf::ForOp LoopPipeliner::createNewForOp() {
 // ref: mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp
 struct PipelinePass : public TritonGPUPipelineBase<PipelinePass> {
   PipelinePass() = default;
-  PipelinePass(int numStages) {
-    this->numStages = numStages;
-  }
+  PipelinePass(int numStages) { this->numStages = numStages; }
 
   void runOnOperation() override {
     int numStages = this->numStages;
diff --git a/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp b/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp
index bedf9f38a..091ca05d3 100644
--- a/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp
@@ -1,7 +1,7 @@
 #include "triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h"
+#include "mlir/IR/BlockAndValueMapping.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
-#include "mlir/IR/BlockAndValueMapping.h"
 #include <algorithm>
 
 using namespace mlir;
@@ -10,7 +10,7 @@ using namespace mlir::triton::gpu;
 //
 // TypeConverter
 //
-TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context, 
+TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
                                                int numThreads)
     : context(context), numThreads(numThreads) {
   // TODO: how does MLIR pick the right conversion?
@@ -38,14 +38,14 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
     // or assert no encoding?
 
     // Now we assume:
-    //   contiguous = 1, order = 0, 1, 2, ..., 
+    //   contiguous = 1, order = 0, 1, 2, ...,
     llvm::SmallVector<unsigned> threadTileSize(rank, 1); // naive layout
     llvm::SmallVector<unsigned> warpTileSize(rank, 1);
     llvm::SmallVector<unsigned> blockTileSize(rank);
     llvm::SmallVector<unsigned> order(rank);
     llvm::SmallVector<unsigned> broadcastAxis;
     int remainingThreads = numThreads;
-    int remainingLanes = /*warp size*/32;
+    int remainingLanes = /*warp size*/ 32;
     for (int64_t dim = 0; dim < rank; ++dim) {
       blockTileSize[dim] = std::clamp(remainingThreads, 1, int(shape[dim]));
       warpTileSize[dim] = std::clamp(remainingLanes, 1, int(shape[dim]));
@@ -56,7 +56,8 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
       // TODO: will we need repetition?
     }
     Attribute encoding = triton::gpu::TritonGPUBlockedEncodingAttr::get(
-        context, threadTileSize, warpTileSize, blockTileSize, order, broadcastAxis);
+        context, threadTileSize, warpTileSize, blockTileSize, order,
+        broadcastAxis);
     return RankedTensorType::get(shape, elementType, encoding);
   });
 
@@ -65,8 +66,9 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
   //
   // This will be called when (newArgType != origArgType)
   // This will create newArg, and map(origArg, newArg)
-  addArgumentMaterialization([&](OpBuilder &builder, RankedTensorType tensorType,
-                                 ValueRange inputs, Location loc) {
+  addArgumentMaterialization([&](OpBuilder &builder,
+                                 RankedTensorType tensorType, ValueRange inputs,
+                                 Location loc) {
     llvm_unreachable("Not implemented");
     return llvm::None;
   });
@@ -74,7 +76,7 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
   // If the origValue still has live user(s), use this to
   // convert origValue to newValue
   addSourceMaterialization([&](OpBuilder &builder, RankedTensorType tensorType,
-                                 ValueRange inputs, Location loc) {
+                               ValueRange inputs, Location loc) {
     llvm_unreachable("Not implemented");
     return llvm::None;
   });
@@ -83,7 +85,7 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
   // where, desiredType = typeConverter->convertType(origType)
   // NOTE: only for remapped values.
   addTargetMaterialization([&](OpBuilder &builder, RankedTensorType tensorType,
-                                ValueRange inputs, Location loc) {
+                               ValueRange inputs, Location loc) {
     llvm_unreachable("Not implemented");
     return llvm::None;
   });
@@ -93,30 +95,31 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
 // TritonGPUConversion
 //
 TritonGPUConversionTarget::TritonGPUConversionTarget(
-  MLIRContext &context, TritonGPUTypeConverter &typeConverter)
+    MLIRContext &context, TritonGPUTypeConverter &typeConverter)
     : ConversionTarget(context), typeConverter(typeConverter) {
   // TODO: we should also verify ops of TritonGPUDialect
   addLegalDialect<triton::gpu::TritonGPUDialect>();
 
   // Some ops from SCF are illegal
-  addIllegalOp<scf::ExecuteRegionOp, scf::ParallelOp, 
-               scf::ReduceOp, scf::ReduceReturnOp>();
-
-  addDynamicallyLegalDialect<arith::ArithmeticDialect,
-                             triton::TritonDialect,
-                             StandardOpsDialect,
-                             scf::SCFDialect>([&](Operation *op) {
-    if (typeConverter.isLegal(op))
-      return true;
-    return false;
-  });
+  addIllegalOp<scf::ExecuteRegionOp, scf::ParallelOp, scf::ReduceOp,
+               scf::ReduceReturnOp>();
 
+  addDynamicallyLegalDialect<arith::ArithmeticDialect, triton::TritonDialect,
+                             StandardOpsDialect, scf::SCFDialect>(
+      [&](Operation *op) {
+        if (typeConverter.isLegal(op))
+          return true;
+        return false;
+      });
 
   // We have requirements for the data layouts
   addDynamicallyLegalOp<triton::DotOp>([this](triton::DotOp dotOp) -> bool {
-    Attribute aEncoding = dotOp.a().getType().cast<RankedTensorType>().getEncoding();
-    Attribute bEncoding = dotOp.b().getType().cast<RankedTensorType>().getEncoding();
-    if (aEncoding && aEncoding.isa<triton::gpu::TritonGPUSharedEncodingAttr>() &&
+    Attribute aEncoding =
+        dotOp.a().getType().cast<RankedTensorType>().getEncoding();
+    Attribute bEncoding =
+        dotOp.b().getType().cast<RankedTensorType>().getEncoding();
+    if (aEncoding &&
+        aEncoding.isa<triton::gpu::TritonGPUSharedEncodingAttr>() &&
         bEncoding && bEncoding.isa<triton::gpu::TritonGPUSharedEncodingAttr>())
       return true;
     // // TODO: we should delete this
@@ -124,7 +127,6 @@ TritonGPUConversionTarget::TritonGPUConversionTarget(
     //   return true;
     return false;
   });
-
 }
 
 // %dst = tt.broadcast %src
@@ -133,12 +135,10 @@ TritonGPUConversionTarget::TritonGPUConversionTarget(
 // %bcst = tt.broadcast %newSrc
 // %dst = convert_layout %bcst
 LogicalResult TritonGPUConversionTarget::refineLayouts(ModuleOp mod,
-                                                      int numThreads) {
+                                                       int numThreads) {
   // collect broadcasts
   SmallVector<triton::BroadcastOp> broadcasts;
-  mod.walk([&](triton::BroadcastOp op) {
-    broadcasts.push_back(op); 
-  });
+  mod.walk([&](triton::BroadcastOp op) { broadcasts.push_back(op); });
 
   BlockAndValueMapping mapping;
   for (auto broadcast : broadcasts) {
@@ -161,20 +161,14 @@ LogicalResult TritonGPUConversionTarget::refineLayouts(ModuleOp mod,
           broadcastAxis.push_back(ax);
 
       Attribute originSrcEnc = tensorType.getEncoding();
-      if (auto blockedEnc = originSrcEnc.dyn_cast<TritonGPUBlockedEncodingAttr>()) {
+      if (auto blockedEnc =
+              originSrcEnc.dyn_cast<TritonGPUBlockedEncodingAttr>()) {
         auto newSrcEnc = TritonGPUBlockedMulticastEncodingAttr::get(
-          blockedEnc.getContext(),
-          blockedEnc.getThreadTileSize(),
-          blockedEnc.getWarpTileSize(),
-          blockedEnc.getBlockTileSize(),
-          blockedEnc.getOrder(),
-          broadcastAxis
-        );
+            blockedEnc.getContext(), blockedEnc.getThreadTileSize(),
+            blockedEnc.getWarpTileSize(), blockedEnc.getBlockTileSize(),
+            blockedEnc.getOrder(), broadcastAxis);
         newSrcType = RankedTensorType::get(
-          tensorType.getShape(),
-          tensorType.getElementType(),
-          newSrcEnc
-        );
+            tensorType.getShape(), tensorType.getElementType(), newSrcEnc);
       } else
         llvm_unreachable("src of broadcast should have blocked encoding");
     } else {
@@ -186,34 +180,25 @@ LogicalResult TritonGPUConversionTarget::refineLayouts(ModuleOp mod,
 
     // create new src
     if (!isSrcScalar) // we don't need to convert layout for scalar values
-      src = builder.create<triton::gpu::ConvertLayoutOp>(
-        src.getLoc(), newSrcType, src
-      );
+      src = builder.create<triton::gpu::ConvertLayoutOp>(src.getLoc(),
+                                                         newSrcType, src);
 
     // create new broadcast
     // compute new type (encoding)
     auto originDstEnc = originDstTensorType.getEncoding()
-                          .dyn_cast<TritonGPUBlockedEncodingAttr>();
+                            .dyn_cast<TritonGPUBlockedEncodingAttr>();
     auto newEnc = TritonGPUBlockedEncodingAttr::get(
-      originDstEnc.getContext(),
-      originDstEnc.getThreadTileSize(),
-      originDstEnc.getWarpTileSize(),
-      originDstEnc.getBlockTileSize(),
-      originDstEnc.getOrder(),
-      broadcastAxis
-    );
-    auto newType = RankedTensorType::get(
-      originDstTensorType.getShape(),
-      originDstTensorType.getElementType(),
-      newEnc
-    );
-    Value newBroadcast = builder.create<triton::BroadcastOp>(
-      broadcast.getLoc(), newType, src
-    );
+        originDstEnc.getContext(), originDstEnc.getThreadTileSize(),
+        originDstEnc.getWarpTileSize(), originDstEnc.getBlockTileSize(),
+        originDstEnc.getOrder(), broadcastAxis);
+    auto newType =
+        RankedTensorType::get(originDstTensorType.getShape(),
+                              originDstTensorType.getElementType(), newEnc);
+    Value newBroadcast =
+        builder.create<triton::BroadcastOp>(broadcast.getLoc(), newType, src);
     // we don't want to change the encoding of the result
     Value newDst = builder.create<triton::gpu::ConvertLayoutOp>(
-      broadcast.getLoc(), originDstType, newBroadcast
-    );
+        broadcast.getLoc(), originDstType, newBroadcast);
 
     broadcast.replaceAllUsesWith(newDst);
     mapping.map(broadcast, newDst);
diff --git a/lib/Dialect/TritonGPU/Transforms/Verifier.cpp b/lib/Dialect/TritonGPU/Transforms/Verifier.cpp
index 16e1d3ec6..e88799927 100644
--- a/lib/Dialect/TritonGPU/Transforms/Verifier.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Verifier.cpp
@@ -5,7 +5,6 @@
 
 using namespace mlir;
 
-
 #define GEN_PASS_CLASSES
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
 
@@ -37,28 +36,30 @@ private:
           if (!encoding.isa<triton::gpu::TritonGPUSharedEncodingAttr>())
             return dotOp.emitError() << name << " should be of shared layout";
         } else
-          return dotOp.emitError() << name << "'s type should be of RankedTensorType";
+          return dotOp.emitError()
+                 << name << "'s type should be of RankedTensorType";
       }
 
       Attribute cLayout;
       for (auto it : llvm::zip(llvm::SmallVector<Type>{cType, dType},
-                              llvm::SmallVector<char>{'c', 'd'})) {
+                               llvm::SmallVector<char>{'c', 'd'})) {
         Type type = std::get<0>(it);
         char name = std::get<1>(it);
         if (auto tensorType = type.dyn_cast<RankedTensorType>()) {
           Attribute encoding = tensorType.getEncoding();
           if (!encoding)
             return dotOp.emitError() << name << " should have encoding";
-          if (!encoding.isa<triton::gpu::TritonGPUMmaEncodingAttr>() && 
+          if (!encoding.isa<triton::gpu::TritonGPUMmaEncodingAttr>() &&
               !encoding.isa<triton::gpu::TritonGPUBlockedEncodingAttr>())
-            return dotOp.emitError() << name << " should be of distributed layout";
+            return dotOp.emitError()
+                   << name << " should be of distributed layout";
           if (name == 'c')
             cLayout = encoding;
           else if (encoding != cLayout)
             return dotOp.emitError() << "d & c should have the same layout";
         } else
-          return dotOp.emitError() << name
-                                   << "'s type should be of RankedTensorType";
+          return dotOp.emitError()
+                 << name << "'s type should be of RankedTensorType";
       }
 
       // signalPassFailure();
@@ -89,7 +90,7 @@ private:
   }
 
   void verifyImpl(Operation *op) {
-    if(verifySingleOp(op).failed())
+    if (verifySingleOp(op).failed())
       signalPassFailure();
 
     // verify that all child regions are ok
diff --git a/lib/driver/dispatch.cc b/lib/driver/dispatch.cc
old mode 100755
new mode 100644
index 9e2aca432..427453b38
--- a/lib/driver/dispatch.cc
+++ b/lib/driver/dispatch.cc
@@ -1,107 +1,152 @@
 /* Copyright 2015-2017 Philippe Tillet
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files
-* (the "Software"), to deal in the Software without restriction,
-* including without limitation the rights to use, copy, modify, merge,
-* publish, distribute, sublicense, and/or sell copies of the Software,
-* and to permit persons to whom the Software is furnished to do so,
-* subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files
+ * (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
 
 #include "triton/driver/dispatch.h"
 
-namespace triton
-{
-namespace driver
-{
+namespace triton {
+namespace driver {
 
-//Helpers for function definition
-#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }\
-void* dispatch::fname ## _;
+// Helpers for function definition
+#define DEFINE0(init, hlib, ret, fname)                                        \
+  ret dispatch::fname() {                                                      \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname);              \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }\
-void* dispatch::fname ## _;
+#define DEFINE1(init, hlib, ret, fname, t1)                                    \
+  ret dispatch::fname(t1 a) {                                                  \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a);           \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }\
-void* dispatch::fname ## _;
+#define DEFINE2(init, hlib, ret, fname, t1, t2)                                \
+  ret dispatch::fname(t1 a, t2 b) {                                            \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b);        \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }\
-void* dispatch::fname ## _;
+#define DEFINE3(init, hlib, ret, fname, t1, t2, t3)                            \
+  ret dispatch::fname(t1 a, t2 b, t3 c) {                                      \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c);     \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }\
-void* dispatch::fname ## _;
+#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4)                        \
+  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d) {                                \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d);  \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }\
-void* dispatch::fname ## _;
+#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5)                    \
+  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e) {                          \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
+                                  e);                                          \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }\
-void* dispatch::fname ## _;
+#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6)                \
+  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f) {                    \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
+                                  e, f);                                       \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }\
-void* dispatch::fname ## _;
+#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7)            \
+  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g) {              \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
+                                  e, f, g);                                    \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }\
-void* dispatch::fname ## _;
+#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)        \
+  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h) {        \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
+                                  e, f, g, h);                                 \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }\
-void* dispatch::fname ## _;
+#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)    \
+  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i) {  \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
+                                  e, f, g, h, i);                              \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }\
-void* dispatch::fname ## _;
+#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
+                 t10)                                                          \
+  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
+                      t10 j) {                                                 \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
+                                  e, f, g, h, i, j);                           \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }\
-void* dispatch::fname ## _;
+#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
+                 t10, t11)                                                     \
+  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
+                      t10 j, t11 k) {                                          \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
+                                  e, f, g, h, i, j, k);                        \
+  }                                                                            \
+  void *dispatch::fname##_;
 
-#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }\
-void* dispatch::fname ## _;
-
-#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }\
-void* dispatch::fname ## _;
+#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
+                 t10, t11, t12, t13)                                           \
+  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
+                      t10 j, t11 k, t12 l, t13 m) {                            \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
+                                  e, f, g, h, i, j, k, l, m);                  \
+  }                                                                            \
+  void *dispatch::fname##_;
 
+#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
+                 t10, t11, t12, t13, t14, t15, t16, t17, t18, t19)             \
+  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
+                      t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q,  \
+                      t18 r, t19 s) {                                          \
+    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
+                                  e, f, g, h, i, j, k, l, m, n, o, p, q, r,    \
+                                  s);                                          \
+  }                                                                            \
+  void *dispatch::fname##_;
 
 /* ------------------- *
  * CUDA
  * ------------------- */
 
-bool dispatch::cuinit(){
-  if(cuda_==nullptr){
-    #ifdef _WIN32
+bool dispatch::cuinit() {
+  if (cuda_ == nullptr) {
+#ifdef _WIN32
     cuda_ = dlopen("cudart64_110.dll", RTLD_LAZY);
-    #else
+#else
     cuda_ = dlopen("libcuda.so", RTLD_LAZY);
-    if(!cuda_)
+    if (!cuda_)
       cuda_ = dlopen("libcuda.so.1", RTLD_LAZY);
-    #endif
-    if(!cuda_)
-      throw std::runtime_error("Could not find `libcuda.so`. Make sure it is in your LD_LIBRARY_PATH.");
+#endif
+    if (!cuda_)
+      throw std::runtime_error("Could not find `libcuda.so`. Make sure it is "
+                               "in your LD_LIBRARY_PATH.");
   }
-  if(cuda_ == nullptr)
+  if (cuda_ == nullptr)
     return false;
   CUresult (*fptr)(unsigned int);
   cuInit_ = dlsym(cuda_, "cuInit");
@@ -112,21 +157,33 @@ bool dispatch::cuinit(){
 }
 
 #define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
-#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
-#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
-#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
-#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
-#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
-#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
-#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
-#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
-#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
-#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
+#define CUDA_DEFINE2(ret, fname, t1, t2)                                       \
+  DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
+#define CUDA_DEFINE3(ret, fname, t1, t2, t3)                                   \
+  DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
+#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4)                               \
+  DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
+#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5)                           \
+  DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
+#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6)                       \
+  DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7)                   \
+  DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)               \
+  DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)           \
+  DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)     \
+  DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
+#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,     \
+                      t11)                                                     \
+  DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, \
+           t11)
 
 // context management
 CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
 CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
-CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
+CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice *)
 CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
 CUDA_DEFINE1(CUresult, cuInit, unsigned int)
 CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
@@ -134,59 +191,71 @@ CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
 CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
 CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
 CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
-CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
-CUDA_DEFINE1(CUresult, cuDeviceGetCount, int*)
+CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute,
+             CUdevice)
+CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
 
 // link management
-CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
-CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
+CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void *,
+             size_t, const char *, unsigned int, CUjit_option *, void **);
+CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option *, void **,
+             CUlinkState *);
 CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
-CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
+CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void **, size_t *);
 // module management
-CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
+CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr *, size_t *, CUmodule,
+             const char *)
 CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
 CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
 CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
-CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
-CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
+CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *,
+             unsigned int, CUjit_option *, void **)
+CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule,
+             const char *)
 // stream management
 CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
 CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
 CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
-CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
-CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
+CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext *)
+CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int,
+              unsigned int, unsigned int, unsigned int, unsigned int,
+              unsigned int, CUstream, void **, void **)
 // function management
-CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
-CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
+CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int *, CUfunction_attribute,
+             CUfunction)
+CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute,
+             int)
 CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
 // memory management
 CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
 CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
-CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
-CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
-CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
-CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
-CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
-CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
+CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t,
+             CUstream)
+CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t,
+             CUstream)
+CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t)
+CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr *, size_t)
+CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void *, CUpointer_attribute,
+             CUdeviceptr)
+CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t,
+             CUstream)
 // event management
 CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
 CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
 CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
 CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
 
-
-
 /* ------------------- *
  * NVML
  * ------------------- */
-bool dispatch::nvmlinit(){
-  #ifdef _WIN32
-  if(nvml_==nullptr)
+bool dispatch::nvmlinit() {
+#ifdef _WIN32
+  if (nvml_ == nullptr)
     nvml_ = dlopen("nvml.dll", RTLD_LAZY);
-  #else
-  if(nvml_==nullptr)
+#else
+  if (nvml_ == nullptr)
     nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
-  #endif
+#endif
   nvmlReturn_t (*fptr)();
   nvmlInit_v2_ = dlsym(nvml_, "nvmlInit_v2");
   *reinterpret_cast<void **>(&fptr) = nvmlInit_v2_;
@@ -197,21 +266,27 @@ bool dispatch::nvmlinit(){
 
 #define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
 #define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
-#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
-#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
+#define NVML_DEFINE2(ret, fname, t1, t2)                                       \
+  DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
+#define NVML_DEFINE3(ret, fname, t1, t2, t3)                                   \
+  DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
 
-NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
-NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
-NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
-NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)
+NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *,
+             nvmlDevice_t *)
+NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t,
+             nvmlClockType_t, unsigned int *)
+NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t,
+             nvmlClockType_t, unsigned int *)
+NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t,
+             unsigned int, unsigned int)
 
 /* ------------------- *
  * HIP
  * ------------------- */
-bool dispatch::hipinit(){
-  if(hip_==nullptr)
+bool dispatch::hipinit() {
+  if (hip_ == nullptr)
     hip_ = dlopen("libamdhip64.so", RTLD_LAZY);
-  if(hip_ == nullptr)
+  if (hip_ == nullptr)
     return false;
   hipError_t (*fptr)();
   hipInit_ = dlsym(hip_, "hipInit");
@@ -222,23 +297,34 @@ bool dispatch::hipinit(){
 }
 
 #define HIP_DEFINE1(ret, fname, t1) DEFINE1(hipinit, hip_, ret, fname, t1)
-#define HIP_DEFINE2(ret, fname, t1, t2) DEFINE2(hipinit, hip_, ret, fname, t1, t2)
-#define HIP_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(hipinit, hip_, ret, fname, t1, t2, t3)
-#define HIP_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(hipinit, hip_, ret, fname, t1, t2, t3, t4)
-#define HIP_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5)
-#define HIP_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6)
-#define HIP_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
-#define HIP_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
-#define HIP_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
-#define HIP_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
-#define HIP_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
+#define HIP_DEFINE2(ret, fname, t1, t2)                                        \
+  DEFINE2(hipinit, hip_, ret, fname, t1, t2)
+#define HIP_DEFINE3(ret, fname, t1, t2, t3)                                    \
+  DEFINE3(hipinit, hip_, ret, fname, t1, t2, t3)
+#define HIP_DEFINE4(ret, fname, t1, t2, t3, t4)                                \
+  DEFINE4(hipinit, hip_, ret, fname, t1, t2, t3, t4)
+#define HIP_DEFINE5(ret, fname, t1, t2, t3, t4, t5)                            \
+  DEFINE5(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5)
+#define HIP_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6)                        \
+  DEFINE6(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define HIP_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7)                    \
+  DEFINE7(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define HIP_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)                \
+  DEFINE8(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define HIP_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)            \
+  DEFINE9(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+#define HIP_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)      \
+  DEFINE10(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
+#define HIP_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
+  DEFINE11(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, \
+           t11)
 
 // context management
 HIP_DEFINE1(hipError_t, hipCtxDestroy, hipCtx_t)
 HIP_DEFINE3(hipError_t, hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t)
-HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t*)
+HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t *)
 HIP_DEFINE1(hipError_t, hipCtxPushCurrent, hipCtx_t)
-HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t*)
+HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t *)
 HIP_DEFINE2(hipError_t, hipCtxEnablePeerAccess, hipCtx_t, unsigned int)
 HIP_DEFINE1(hipError_t, hipInit, unsigned int)
 HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *)
@@ -246,56 +332,64 @@ HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *)
 HIP_DEFINE2(hipError_t, hipGetDevice, hipDevice_t *, int)
 HIP_DEFINE3(hipError_t, hipDeviceGetName, char *, int, hipDevice_t)
 HIP_DEFINE3(hipError_t, hipDeviceGetPCIBusId, char *, int, hipDevice_t)
-HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t, hipDevice_t)
+HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t,
+            hipDevice_t)
 HIP_DEFINE1(hipError_t, hipGetDeviceCount, int *)
 // module management
-HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t*, size_t*, hipModule_t, const char*)
+HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t *, size_t *,
+            hipModule_t, const char *)
 HIP_DEFINE2(hipError_t, hipModuleLoad, hipModule_t *, const char *)
 HIP_DEFINE1(hipError_t, hipModuleUnload, hipModule_t)
 HIP_DEFINE2(hipError_t, hipModuleLoadData, hipModule_t *, const void *)
-HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *, unsigned int, hipJitOption *, void **)
-HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t, const char *)
+HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *,
+            unsigned int, hipJitOption *, void **)
+HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t,
+            const char *)
 // stream management
 HIP_DEFINE2(hipError_t, hipStreamCreate, hipStream_t *, unsigned int)
 HIP_DEFINE1(hipError_t, hipStreamSynchronize, hipStream_t)
 HIP_DEFINE1(hipError_t, hipStreamDestroy, hipStream_t)
-HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, hipStream_t, void **, void **)
+HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int,
+             unsigned int, unsigned int, unsigned int, unsigned int,
+             unsigned int, unsigned int, hipStream_t, void **, void **)
 // function management
-HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes*, void*)
+HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes *, void *)
 HIP_DEFINE2(hipError_t, hipFuncSetCacheConfig, hipFunction_t, hipFuncCache_t)
 // memory management
 HIP_DEFINE3(hipError_t, hipMemcpyDtoH, void *, hipDeviceptr_t, size_t)
 HIP_DEFINE1(hipError_t, hipFree, hipDeviceptr_t)
-HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t, hipStream_t)
-HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *, size_t, hipStream_t)
-HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t )
-HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t*, size_t)
-HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void*, CUpointer_attribute, hipDeviceptr_t)
-HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t, hipStream_t)
+HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t,
+            hipStream_t)
+HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *,
+            size_t, hipStream_t)
+HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t)
+HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t *, size_t)
+HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void *, CUpointer_attribute,
+            hipDeviceptr_t)
+HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t,
+            hipStream_t)
 // event management
 HIP_DEFINE2(hipError_t, hipEventCreate, hipEvent_t *, unsigned int)
 HIP_DEFINE3(hipError_t, hipEventElapsedTime, float *, hipEvent_t, hipEvent_t)
 HIP_DEFINE2(hipError_t, hipEventRecord, hipEvent_t, hipStream_t)
 HIP_DEFINE1(hipError_t, hipEventDestroy, hipEvent_t)
 
-
 /* ------------------- *
  * COMMON
  * ------------------- */
 
 // Release
-void dispatch::release(){
-  if(cuda_){
+void dispatch::release() {
+  if (cuda_) {
     dlclose(cuda_);
     cuda_ = nullptr;
   }
 }
 
-void* dispatch::cuda_;
-void* dispatch::nvml_;
-void* dispatch::nvmlInit_v2_;
-void* dispatch::hip_;
+void *dispatch::cuda_;
+void *dispatch::nvml_;
+void *dispatch::nvmlInit_v2_;
+void *dispatch::hip_;
 
-
-}
-}
+} // namespace driver
+} // namespace triton
diff --git a/lib/driver/error.cc b/lib/driver/error.cc
old mode 100755
new mode 100644
index f723351c2..4b366746e
--- a/lib/driver/error.cc
+++ b/lib/driver/error.cc
@@ -1,166 +1,270 @@
 /* Copyright 2015-2017 Philippe Tillet
-* 
-* Permission is hereby granted, free of charge, to any person obtaining 
-* a copy of this software and associated documentation files 
-* (the "Software"), to deal in the Software without restriction, 
-* including without limitation the rights to use, copy, modify, merge, 
-* publish, distribute, sublicense, and/or sell copies of the Software, 
-* and to permit persons to whom the Software is furnished to do so, 
-* subject to the following conditions:
-* 
-* The above copyright notice and this permission notice shall be 
-* included in all copies or substantial portions of the Software.
-* 
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files
+ * (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
 
 #include "triton/driver/error.h"
 
-namespace triton
-{
-namespace driver
-{
+namespace triton {
+namespace driver {
 
-void check(CUresult err)
-{
+void check(CUresult err) {
   using namespace exception::cuda;
-  switch(err)
-  {
-  case CUDA_SUCCESS                              : break;
-  case CUDA_ERROR_INVALID_VALUE                  : throw invalid_value();
-  case CUDA_ERROR_OUT_OF_MEMORY                  : throw out_of_memory();
-  case CUDA_ERROR_NOT_INITIALIZED                : throw not_initialized();
-  case CUDA_ERROR_DEINITIALIZED                  : throw deinitialized();
-  case CUDA_ERROR_PROFILER_DISABLED              : throw profiler_disabled();
-  case CUDA_ERROR_PROFILER_NOT_INITIALIZED       : throw profiler_not_initialized();
-  case CUDA_ERROR_PROFILER_ALREADY_STARTED       : throw profiler_already_started();
-  case CUDA_ERROR_PROFILER_ALREADY_STOPPED       : throw profiler_already_stopped();
-  case CUDA_ERROR_NO_DEVICE                      : throw no_device();
-  case CUDA_ERROR_INVALID_DEVICE                 : throw invalid_device();
-  case CUDA_ERROR_INVALID_IMAGE                  : throw invalid_image();
-  case CUDA_ERROR_INVALID_CONTEXT                : throw invalid_context();
-  case CUDA_ERROR_CONTEXT_ALREADY_CURRENT        : throw context_already_current();
-  case CUDA_ERROR_MAP_FAILED                     : throw map_failed();
-  case CUDA_ERROR_UNMAP_FAILED                   : throw unmap_failed();
-  case CUDA_ERROR_ARRAY_IS_MAPPED                : throw array_is_mapped();
-  case CUDA_ERROR_ALREADY_MAPPED                 : throw already_mapped();
-  case CUDA_ERROR_NO_BINARY_FOR_GPU              : throw no_binary_for_gpu();
-  case CUDA_ERROR_ALREADY_ACQUIRED               : throw already_acquired();
-  case CUDA_ERROR_NOT_MAPPED                     : throw not_mapped();
-  case CUDA_ERROR_NOT_MAPPED_AS_ARRAY            : throw not_mapped_as_array();
-  case CUDA_ERROR_NOT_MAPPED_AS_POINTER          : throw not_mapped_as_pointer();
-  case CUDA_ERROR_ECC_UNCORRECTABLE              : throw ecc_uncorrectable();
-  case CUDA_ERROR_UNSUPPORTED_LIMIT              : throw unsupported_limit();
-  case CUDA_ERROR_CONTEXT_ALREADY_IN_USE         : throw context_already_in_use();
-  case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED        : throw peer_access_unsupported();
-  case CUDA_ERROR_INVALID_PTX                    : throw invalid_ptx();
-  case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT       : throw invalid_graphics_context();
-  case CUDA_ERROR_INVALID_SOURCE                 : throw invalid_source();
-  case CUDA_ERROR_FILE_NOT_FOUND                 : throw file_not_found();
-  case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND : throw shared_object_symbol_not_found();
-  case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      : throw shared_object_init_failed();
-  case CUDA_ERROR_OPERATING_SYSTEM               : throw operating_system();
-  case CUDA_ERROR_INVALID_HANDLE                 : throw invalid_handle();
-  case CUDA_ERROR_NOT_FOUND                      : throw not_found();
-  case CUDA_ERROR_NOT_READY                      : throw not_ready();
-  case CUDA_ERROR_ILLEGAL_ADDRESS                : throw illegal_address();
-  case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        : throw launch_out_of_resources();
-  case CUDA_ERROR_LAUNCH_TIMEOUT                 : throw launch_timeout();
-  case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  : throw launch_incompatible_texturing();
-  case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    : throw peer_access_already_enabled();
-  case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        : throw peer_access_not_enabled();
-  case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         : throw primary_context_active();
-  case CUDA_ERROR_CONTEXT_IS_DESTROYED           : throw context_is_destroyed();
-  case CUDA_ERROR_ASSERT                         : throw assert_error();
-  case CUDA_ERROR_TOO_MANY_PEERS                 : throw too_many_peers();
-  case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED : throw host_memory_already_registered();
-  case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     : throw host_memory_not_registered();
-  case CUDA_ERROR_HARDWARE_STACK_ERROR           : throw hardware_stack_error();
-  case CUDA_ERROR_ILLEGAL_INSTRUCTION            : throw illegal_instruction();
-  case CUDA_ERROR_MISALIGNED_ADDRESS             : throw misaligned_address();
-  case CUDA_ERROR_INVALID_ADDRESS_SPACE          : throw invalid_address_space();
-  case CUDA_ERROR_INVALID_PC                     : throw invalid_pc();
-  case CUDA_ERROR_LAUNCH_FAILED                  : throw launch_failed();
-  case CUDA_ERROR_NOT_PERMITTED                  : throw not_permitted();
-  case CUDA_ERROR_NOT_SUPPORTED                  : throw not_supported();
-  case CUDA_ERROR_UNKNOWN                        : throw unknown();
-  default                                        : throw unknown();
+  switch (err) {
+  case CUDA_SUCCESS:
+    break;
+  case CUDA_ERROR_INVALID_VALUE:
+    throw invalid_value();
+  case CUDA_ERROR_OUT_OF_MEMORY:
+    throw out_of_memory();
+  case CUDA_ERROR_NOT_INITIALIZED:
+    throw not_initialized();
+  case CUDA_ERROR_DEINITIALIZED:
+    throw deinitialized();
+  case CUDA_ERROR_PROFILER_DISABLED:
+    throw profiler_disabled();
+  case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
+    throw profiler_not_initialized();
+  case CUDA_ERROR_PROFILER_ALREADY_STARTED:
+    throw profiler_already_started();
+  case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
+    throw profiler_already_stopped();
+  case CUDA_ERROR_NO_DEVICE:
+    throw no_device();
+  case CUDA_ERROR_INVALID_DEVICE:
+    throw invalid_device();
+  case CUDA_ERROR_INVALID_IMAGE:
+    throw invalid_image();
+  case CUDA_ERROR_INVALID_CONTEXT:
+    throw invalid_context();
+  case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
+    throw context_already_current();
+  case CUDA_ERROR_MAP_FAILED:
+    throw map_failed();
+  case CUDA_ERROR_UNMAP_FAILED:
+    throw unmap_failed();
+  case CUDA_ERROR_ARRAY_IS_MAPPED:
+    throw array_is_mapped();
+  case CUDA_ERROR_ALREADY_MAPPED:
+    throw already_mapped();
+  case CUDA_ERROR_NO_BINARY_FOR_GPU:
+    throw no_binary_for_gpu();
+  case CUDA_ERROR_ALREADY_ACQUIRED:
+    throw already_acquired();
+  case CUDA_ERROR_NOT_MAPPED:
+    throw not_mapped();
+  case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
+    throw not_mapped_as_array();
+  case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
+    throw not_mapped_as_pointer();
+  case CUDA_ERROR_ECC_UNCORRECTABLE:
+    throw ecc_uncorrectable();
+  case CUDA_ERROR_UNSUPPORTED_LIMIT:
+    throw unsupported_limit();
+  case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
+    throw context_already_in_use();
+  case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
+    throw peer_access_unsupported();
+  case CUDA_ERROR_INVALID_PTX:
+    throw invalid_ptx();
+  case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
+    throw invalid_graphics_context();
+  case CUDA_ERROR_INVALID_SOURCE:
+    throw invalid_source();
+  case CUDA_ERROR_FILE_NOT_FOUND:
+    throw file_not_found();
+  case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
+    throw shared_object_symbol_not_found();
+  case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
+    throw shared_object_init_failed();
+  case CUDA_ERROR_OPERATING_SYSTEM:
+    throw operating_system();
+  case CUDA_ERROR_INVALID_HANDLE:
+    throw invalid_handle();
+  case CUDA_ERROR_NOT_FOUND:
+    throw not_found();
+  case CUDA_ERROR_NOT_READY:
+    throw not_ready();
+  case CUDA_ERROR_ILLEGAL_ADDRESS:
+    throw illegal_address();
+  case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
+    throw launch_out_of_resources();
+  case CUDA_ERROR_LAUNCH_TIMEOUT:
+    throw launch_timeout();
+  case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
+    throw launch_incompatible_texturing();
+  case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
+    throw peer_access_already_enabled();
+  case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
+    throw peer_access_not_enabled();
+  case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
+    throw primary_context_active();
+  case CUDA_ERROR_CONTEXT_IS_DESTROYED:
+    throw context_is_destroyed();
+  case CUDA_ERROR_ASSERT:
+    throw assert_error();
+  case CUDA_ERROR_TOO_MANY_PEERS:
+    throw too_many_peers();
+  case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
+    throw host_memory_already_registered();
+  case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
+    throw host_memory_not_registered();
+  case CUDA_ERROR_HARDWARE_STACK_ERROR:
+    throw hardware_stack_error();
+  case CUDA_ERROR_ILLEGAL_INSTRUCTION:
+    throw illegal_instruction();
+  case CUDA_ERROR_MISALIGNED_ADDRESS:
+    throw misaligned_address();
+  case CUDA_ERROR_INVALID_ADDRESS_SPACE:
+    throw invalid_address_space();
+  case CUDA_ERROR_INVALID_PC:
+    throw invalid_pc();
+  case CUDA_ERROR_LAUNCH_FAILED:
+    throw launch_failed();
+  case CUDA_ERROR_NOT_PERMITTED:
+    throw not_permitted();
+  case CUDA_ERROR_NOT_SUPPORTED:
+    throw not_supported();
+  case CUDA_ERROR_UNKNOWN:
+    throw unknown();
+  default:
+    throw unknown();
   }
 }
 
 void check(hipError_t error) {
   using namespace exception::hip;
-  switch(error)
-  {
-  case hipSuccess                              : break;
-    case hipErrorInvalidValue                  : throw invalid_value();
-    case hipErrorMemoryAllocation                  : throw out_of_memory();
-    case hipErrorNotInitialized                : throw not_initialized();
-    case hipErrorDeinitialized                  : throw deinitialized();
-    case hipErrorProfilerDisabled              : throw profiler_disabled();
-    case hipErrorProfilerNotInitialized       : throw profiler_not_initialized();
-    case hipErrorProfilerAlreadyStarted       : throw profiler_already_started();
-    case hipErrorProfilerAlreadyStopped       : throw profiler_already_stopped();
-    case hipErrorNoDevice                      : throw no_device();
-    case hipErrorInvalidSymbol                      : throw invalid_symbol();
-    case hipErrorInvalidDevice                 : throw invalid_device();
-    case hipErrorInvalidImage                  : throw invalid_image();
-    case hipErrorInvalidContext                : throw invalid_context();
-    case hipErrorContextAlreadyCurrent        : throw context_already_current();
-    case hipErrorMapFailed                     : throw map_failed();
-    case hipErrorUnmapFailed                   : throw unmap_failed();
-    case hipErrorArrayIsMapped                : throw array_is_mapped();
-    case hipErrorAlreadyMapped                 : throw already_mapped();
-    case hipErrorNoBinaryForGpu              : throw no_binary_for_gpu();
-    case hipErrorAlreadyAcquired               : throw already_acquired();
-    case hipErrorNotMapped                     : throw not_mapped();
-    case hipErrorNotMappedAsArray             : throw not_mapped_as_array();
-    case hipErrorNotMappedAsPointer           : throw not_mapped_as_pointer();
-    case hipErrorECCNotCorrectable            : throw ecc_uncorrectable();
-    case hipErrorUnsupportedLimit             : throw unsupported_limit();
-    case hipErrorContextAlreadyInUse          : throw context_already_in_use();
-    case hipErrorPeerAccessUnsupported        : throw peer_access_unsupported();
-    case hipErrorInvalidKernelFile            : throw invalid_ptx();
-    case hipErrorInvalidGraphicsContext       : throw invalid_graphics_context();
-    case hipErrorInvalidSource                 : throw invalid_source();
-    case hipErrorFileNotFound                 : throw file_not_found();
-    case hipErrorSharedObjectSymbolNotFound : throw shared_object_symbol_not_found();
-    case hipErrorSharedObjectInitFailed      : throw shared_object_init_failed();
-    case hipErrorOperatingSystem               : throw operating_system();
-    case hipErrorInvalidResourceHandle                 : throw invalid_handle();
-    case hipErrorNotFound                      : throw not_found();
-    case hipErrorNotReady                      : throw not_ready();
-    case hipErrorIllegalAddress                : throw illegal_address();
-    case hipErrorLaunchOutOfResources        : throw launch_out_of_resources();
-    case hipErrorLaunchTimeOut                 : throw launch_timeout();
-    // case hipErrorLaunchIncompatibleTexturing  : throw launch_incompatible_texturing();
-    case hipErrorPeerAccessAlreadyEnabled    : throw peer_access_already_enabled();
-    case hipErrorPeerAccessNotEnabled        : throw peer_access_not_enabled();
-    // case hipErrorPrimaryContextActive         : throw primary_context_active();
-    // case hipErrorContextIsDestroyed           : throw context_is_destroyed();
-    case hipErrorAssert                         : throw assert_error();
-    // case hipErrorTooManyPeers                 : throw too_many_peers();
-    case hipErrorHostMemoryAlreadyRegistered : throw host_memory_already_registered();
-    case hipErrorHostMemoryNotRegistered     : throw host_memory_not_registered();
-    // case hipErrorHardwareStackError           : throw hardware_stack_error();
-    // case hipErrorIllegalInstruction            : throw illegal_instruction();
-    // case hipErrorMisalignedAddress             : throw misaligned_address();
-    // case hipErrorInvalidAddressSpace          : throw invalid_address_space();
-    // case hipErrorInvalidPc                     : throw invalid_pc();
-    case hipErrorLaunchFailure                  : throw launch_failed();
-    // case hipErrorNotPermitted                  : throw not_permitted();
-    case hipErrorNotSupported                  : throw not_supported();
-    case hipErrorUnknown                        : throw unknown();
-    default                                        : throw unknown();
-}
-}
-
-}
+  switch (error) {
+  case hipSuccess:
+    break;
+  case hipErrorInvalidValue:
+    throw invalid_value();
+  case hipErrorMemoryAllocation:
+    throw out_of_memory();
+  case hipErrorNotInitialized:
+    throw not_initialized();
+  case hipErrorDeinitialized:
+    throw deinitialized();
+  case hipErrorProfilerDisabled:
+    throw profiler_disabled();
+  case hipErrorProfilerNotInitialized:
+    throw profiler_not_initialized();
+  case hipErrorProfilerAlreadyStarted:
+    throw profiler_already_started();
+  case hipErrorProfilerAlreadyStopped:
+    throw profiler_already_stopped();
+  case hipErrorNoDevice:
+    throw no_device();
+  case hipErrorInvalidSymbol:
+    throw invalid_symbol();
+  case hipErrorInvalidDevice:
+    throw invalid_device();
+  case hipErrorInvalidImage:
+    throw invalid_image();
+  case hipErrorInvalidContext:
+    throw invalid_context();
+  case hipErrorContextAlreadyCurrent:
+    throw context_already_current();
+  case hipErrorMapFailed:
+    throw map_failed();
+  case hipErrorUnmapFailed:
+    throw unmap_failed();
+  case hipErrorArrayIsMapped:
+    throw array_is_mapped();
+  case hipErrorAlreadyMapped:
+    throw already_mapped();
+  case hipErrorNoBinaryForGpu:
+    throw no_binary_for_gpu();
+  case hipErrorAlreadyAcquired:
+    throw already_acquired();
+  case hipErrorNotMapped:
+    throw not_mapped();
+  case hipErrorNotMappedAsArray:
+    throw not_mapped_as_array();
+  case hipErrorNotMappedAsPointer:
+    throw not_mapped_as_pointer();
+  case hipErrorECCNotCorrectable:
+    throw ecc_uncorrectable();
+  case hipErrorUnsupportedLimit:
+    throw unsupported_limit();
+  case hipErrorContextAlreadyInUse:
+    throw context_already_in_use();
+  case hipErrorPeerAccessUnsupported:
+    throw peer_access_unsupported();
+  case hipErrorInvalidKernelFile:
+    throw invalid_ptx();
+  case hipErrorInvalidGraphicsContext:
+    throw invalid_graphics_context();
+  case hipErrorInvalidSource:
+    throw invalid_source();
+  case hipErrorFileNotFound:
+    throw file_not_found();
+  case hipErrorSharedObjectSymbolNotFound:
+    throw shared_object_symbol_not_found();
+  case hipErrorSharedObjectInitFailed:
+    throw shared_object_init_failed();
+  case hipErrorOperatingSystem:
+    throw operating_system();
+  case hipErrorInvalidResourceHandle:
+    throw invalid_handle();
+  case hipErrorNotFound:
+    throw not_found();
+  case hipErrorNotReady:
+    throw not_ready();
+  case hipErrorIllegalAddress:
+    throw illegal_address();
+  case hipErrorLaunchOutOfResources:
+    throw launch_out_of_resources();
+  case hipErrorLaunchTimeOut:
+    throw launch_timeout();
+  // case hipErrorLaunchIncompatibleTexturing  : throw
+  // launch_incompatible_texturing();
+  case hipErrorPeerAccessAlreadyEnabled:
+    throw peer_access_already_enabled();
+  case hipErrorPeerAccessNotEnabled:
+    throw peer_access_not_enabled();
+  // case hipErrorPrimaryContextActive         : throw primary_context_active();
+  // case hipErrorContextIsDestroyed           : throw context_is_destroyed();
+  case hipErrorAssert:
+    throw assert_error();
+  // case hipErrorTooManyPeers                 : throw too_many_peers();
+  case hipErrorHostMemoryAlreadyRegistered:
+    throw host_memory_already_registered();
+  case hipErrorHostMemoryNotRegistered:
+    throw host_memory_not_registered();
+  // case hipErrorHardwareStackError           : throw hardware_stack_error();
+  // case hipErrorIllegalInstruction            : throw illegal_instruction();
+  // case hipErrorMisalignedAddress             : throw misaligned_address();
+  // case hipErrorInvalidAddressSpace          : throw invalid_address_space();
+  // case hipErrorInvalidPc                     : throw invalid_pc();
+  case hipErrorLaunchFailure:
+    throw launch_failed();
+  // case hipErrorNotPermitted                  : throw not_permitted();
+  case hipErrorNotSupported:
+    throw not_supported();
+  case hipErrorUnknown:
+    throw unknown();
+  default:
+    throw unknown();
+  }
 }
 
+} // namespace driver
+} // namespace triton
diff --git a/lib/driver/llvm.cc b/lib/driver/llvm.cc
index ee82c467e..f78e9c8e9 100644
--- a/lib/driver/llvm.cc
+++ b/lib/driver/llvm.cc
@@ -1,73 +1,73 @@
 /* Copyright 2015-2017 Philippe Tillet
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files
-* (the "Software"), to deal in the Software without restriction,
-* including without limitation the rights to use, copy, modify, merge,
-* publish, distribute, sublicense, and/or sell copies of the Software,
-* and to permit persons to whom the Software is furnished to do so,
-* subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files
+ * (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
 #include <fstream>
 #if __has_include(<unistd.h>)
-    #include <unistd.h>
+#include <unistd.h>
 #endif
-#include <memory>
-#include <regex>
-#include "triton/driver/llvm.h"
 #include "triton/driver/dispatch.h"
 #include "triton/driver/error.h"
+#include "triton/driver/llvm.h"
 #include "triton/tools/sha1.hpp"
+#include "triton/tools/sys/exec.hpp"
 #include "triton/tools/sys/getenv.hpp"
 #include "triton/tools/sys/mkdir.hpp"
-#include "triton/tools/sys/exec.hpp"
-#include "llvm/MC/TargetRegistry.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Verifier.h"
 #include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
-#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <memory>
+#include <regex>
 
 // begin AMD stuff
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/ToolOutputFile.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
 // end AMD stuff
 
-extern "C"{
-  int set_curterm(char* nterm){ return 0; }
-  int del_curterm(char* nterm){ return 0; }
-  int tigetnum(char *capname) { return 0; }
-  int setupterm(char *term, int fildes, int *errret) { return 0; }
+extern "C" {
+int set_curterm(char *nterm) { return 0; }
+int del_curterm(char *nterm) { return 0; }
+int tigetnum(char *capname) { return 0; }
+int setupterm(char *term, int fildes, int *errret) { return 0; }
 }
 
-namespace triton{
-namespace driver{
+namespace triton {
+namespace driver {
 
 void init_llvm() {
   LLVMInitializeNVPTXTargetInfo();
@@ -80,82 +80,93 @@ void init_llvm() {
   LLVMInitializeAMDGPUAsmPrinter();
 }
 
-
 /* ------------------------ */
 //         CUDA             //
 /* ------------------------ */
-static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
+static bool find_and_replace(std::string &str, const std::string &begin,
+                             const std::string &end,
+                             const std::string &target) {
   size_t start_replace = str.find(begin);
   size_t end_replace = str.find(end, start_replace);
-  if(start_replace == std::string::npos)
+  if (start_replace == std::string::npos)
     return false;
   str.replace(start_replace, end_replace + 1 - start_replace, target);
   return true;
 }
 
-std::string path_to_ptxas(int& version) {
+std::string path_to_ptxas(int &version) {
   std::vector<std::string> rets;
   std::string ret;
   // search pathes for ptxas
   std::vector<std::string> ptxas_prefixes = {"", "/usr/local/cuda/bin/"};
   std::string triton_ptxas = tools::getenv("TRITON_PTXAS_PATH");
-  if(!triton_ptxas.empty())
+  if (!triton_ptxas.empty())
     ptxas_prefixes.insert(ptxas_prefixes.begin(), triton_ptxas);
   // see what path for ptxas are valid
   std::vector<std::string> working_ptxas;
-  for(std::string prefix: ptxas_prefixes){
+  for (std::string prefix : ptxas_prefixes) {
     std::string ptxas = prefix + "ptxas";
     bool works = tools::exec(ptxas + " --version 2>&1", ret) == 0;
-    if(works) {
+    if (works) {
       working_ptxas.push_back(ptxas);
       rets.push_back(ret);
     }
   }
   // error if no working ptxas was found
-  if(working_ptxas.empty())
-    throw std::runtime_error("`ptxas` was searched in TRITON_PTXAS_PATH, /usr/local/cuda/bin/ or PATH"
+  if (working_ptxas.empty())
+    throw std::runtime_error("`ptxas` was searched in TRITON_PTXAS_PATH, "
+                             "/usr/local/cuda/bin/ or PATH"
                              " but a working version could not be found.");
   std::string ptxas = working_ptxas.front();
   // parse version
   std::regex version_regex("release (\\d+)\\.(\\d+)");
   std::smatch match;
   bool found = false;
-  // currently choosing the first ptxas. Other logics can be implemented in future
-  for(std::string ret : rets) {
-    if(std::regex_search(ret, match, version_regex)){
+  // currently choosing the first ptxas. Other logics can be implemented in
+  // future
+  for (std::string ret : rets) {
+    if (std::regex_search(ret, match, version_regex)) {
       int major = std::stoi(match[1]);
       int minor = std::stoi(match[2]);
-      version = major*1000 + minor*10;
+      version = major * 1000 + minor * 10;
       found = true;
       break;
     }
   }
-  if ( not found) {
+  if (not found) {
     throw std::runtime_error("Error in parsing version");
   }
   return ptxas;
 }
 
-
-int vptx(int version){
-  if(version >= 11040) return 74;
-  if(version >= 11030) return 73;
-  if(version >= 11020) return 72;
-  if(version >= 11010) return 71;
-  if(version >= 11000) return 70;
-  if(version >= 10020) return 65;
-  if(version >= 10010) return 64;
-  if(version >= 10000) return 63;
+int vptx(int version) {
+  if (version >= 11040)
+    return 74;
+  if (version >= 11030)
+    return 73;
+  if (version >= 11020)
+    return 72;
+  if (version >= 11010)
+    return 71;
+  if (version >= 11000)
+    return 70;
+  if (version >= 10020)
+    return 65;
+  if (version >= 10010)
+    return 64;
+  if (version >= 10000)
+    return 63;
   throw std::runtime_error("Triton requires CUDA 10+");
 }
 
-std::string llir_to_ptx(llvm::Module* module, int cc, int version){
+std::string llir_to_ptx(llvm::Module *module, int cc, int version) {
   // LLVM version in use may not officially support target hardware
   int max_nvvm_cc = 75;
   int max_nvvm_ptx = 74;
   // options
   auto options = llvm::cl::getRegisteredOptions();
-  auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
+  auto *short_ptr =
+      static_cast<llvm::cl::opt<bool> *>(options["nvptx-short-ptr"]);
   assert(short_ptr);
   short_ptr->setValue(true);
   // compute capability
@@ -170,7 +181,8 @@ std::string llir_to_ptx(llvm::Module* module, int cc, int version){
   std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
   std::string layout = "";
   std::string features = "";
-  // std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
+  // std::string features = "+ptx" + std::to_string(std::min(ptx,
+  // max_nvvm_ptx));
   init_llvm();
   // verify and store llvm
   llvm::legacy::PassManager pm;
@@ -181,16 +193,18 @@ std::string llir_to_ptx(llvm::Module* module, int cc, int version){
   // create machine
   module->setTargetTriple(triple);
   std::string error;
-  auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
+  auto target =
+      llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
   llvm::TargetOptions opt;
   opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
   opt.UnsafeFPMath = false;
   opt.NoInfsFPMath = false;
   opt.NoNaNsFPMath = true;
-  llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
-                                                             llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive);
+  llvm::TargetMachine *machine = target->createTargetMachine(
+      module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_,
+      llvm::None, llvm::CodeGenOpt::Aggressive);
   // set data layout
-  if(layout.empty())
+  if (layout.empty())
     module->setDataLayout(machine->createDataLayout());
   else
     module->setDataLayout(layout);
@@ -200,19 +214,25 @@ std::string llir_to_ptx(llvm::Module* module, int cc, int version){
   llvm::legacy::PassManager pass;
   llvm::raw_svector_ostream stream(buffer);
   // emit
-  machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile);
+  machine->addPassesToEmitFile(pass, stream, nullptr,
+                               llvm::CodeGenFileType::CGFT_AssemblyFile);
   pass.run(*module);
 
   // post-process
   std::string result(buffer.begin(), buffer.end());
-  find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
+  find_and_replace(result, ".version", "\n",
+                   ".version " + std::to_string(ptx_major) + "." +
+                       std::to_string(ptx_minor) + "\n");
   find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
-  while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
-  while(find_and_replace(result, "\t// end inline asm", "\n", ""));
+  while (find_and_replace(result, "\t// begin inline asm", "\n", ""))
+    ;
+  while (find_and_replace(result, "\t// end inline asm", "\n", ""))
+    ;
   return result;
 }
 
-std::string ptx_to_cubin(const std::string& ptx, const std::string& ptxas, int cc) {
+std::string ptx_to_cubin(const std::string &ptx, const std::string &ptxas,
+                         int cc) {
   // compile ptx with ptxas
   char _fsrc[L_tmpnam];
   char _flog[L_tmpnam];
@@ -221,15 +241,16 @@ std::string ptx_to_cubin(const std::string& ptx, const std::string& ptxas, int c
   std::string fsrc = _fsrc;
   std::string flog = _flog;
   std::string fbin = fsrc + ".o";
-  const char* _fbin = fbin.c_str();
+  const char *_fbin = fbin.c_str();
   std::ofstream ofs(fsrc);
   ofs << ptx << std::endl;
   ofs.close();
   std::string cmd;
   int err;
-  cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
+  cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc +
+        " -o " + fsrc + ".o 2> " + flog;
   err = system(cmd.c_str());
-  if(err != 0){
+  if (err != 0) {
     std::ifstream _log(_flog);
     std::string log(std::istreambuf_iterator<char>(_log), {});
     unlink(_fsrc);
@@ -237,7 +258,7 @@ std::string ptx_to_cubin(const std::string& ptx, const std::string& ptxas, int c
     throw std::runtime_error("Internal Triton PTX codegen error: \n" + log);
   }
   CUmodule ret;
-  std::ifstream _cubin(_fbin, std::ios::binary );
+  std::ifstream _cubin(_fbin, std::ios::binary);
   std::string cubin(std::istreambuf_iterator<char>(_cubin), {});
   _cubin.close();
   unlink(_fsrc);
@@ -251,11 +272,11 @@ std::string ptx_to_cubin(const std::string& ptx, const std::string& ptxas, int c
 //         HIP              //
 /* ------------------------ */
 
-std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) {
+std::string llir_to_amdgpu(llvm::Module *module, const std::string &_proc) {
   init_llvm();
 
-//  proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo));
-//  features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo));
+  //  proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo));
+  //  features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo));
 
   // create
   llvm::SmallVector<char, 0> buffer;
@@ -270,17 +291,18 @@ std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) {
   // create machine
   module->setTargetTriple(triple);
   std::string error;
-  auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
+  auto target =
+      llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
   llvm::TargetOptions opt;
   opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
   opt.UnsafeFPMath = false;
   opt.NoInfsFPMath = false;
   opt.NoNaNsFPMath = true;
-  llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
-                                                             llvm::Reloc::PIC_, llvm::None,
-                                                             llvm::CodeGenOpt::Aggressive);
+  llvm::TargetMachine *machine = target->createTargetMachine(
+      module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_,
+      llvm::None, llvm::CodeGenOpt::Aggressive);
   // set data layout
-  if(layout.empty())
+  if (layout.empty())
     module->setDataLayout(machine->createDataLayout());
   else
     module->setDataLayout(layout);
@@ -295,33 +317,37 @@ std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) {
   std::error_code ec;
 
   // Save GCN ISA binary.
-  std::string isabin_path = std::string("/tmp/") + module_name + std::string(".o");
+  std::string isabin_path =
+      std::string("/tmp/") + module_name + std::string(".o");
   std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
       new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
-  if (ec)
-  {
-    std::cout << isabin_path << " was not created. error code: " << ec << std::endl;
+  if (ec) {
+    std::cout << isabin_path << " was not created. error code: " << ec
+              << std::endl;
   }
 
   // emit
-  machine->addPassesToEmitFile(pass, *isabin_fs, nullptr, llvm::CGFT_ObjectFile);
+  machine->addPassesToEmitFile(pass, *isabin_fs, nullptr,
+                               llvm::CGFT_ObjectFile);
   pass.run(*module);
   // Save GCN ISA.
-  std::string amdgcn_path = std::string("/tmp/") + module_name + std::string(".gcn");
+  std::string amdgcn_path =
+      std::string("/tmp/") + module_name + std::string(".gcn");
   std::string result(buffer.begin(), buffer.end());
   std::ofstream amdgcn(amdgcn_path);
   amdgcn << result;
   amdgcn.close();
 
   // generate HASCO file
-  std::string hsaco_path = std::string("/tmp/") + module_name + std::string(".hsaco");
+  std::string hsaco_path =
+      std::string("/tmp/") + module_name + std::string(".hsaco");
   std::string error_message;
   int lld_result =
       llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld",
-                                {"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu", "-shared", "-o", hsaco_path, isabin_path},
+                                {"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu",
+                                 "-shared", "-o", hsaco_path, isabin_path},
                                 llvm::None, {}, 0, 0, &error_message);
-  if (lld_result)
-  {
+  if (lld_result) {
     std::cout << "ld.lld execute fail: " << std::endl;
     std::cout << error_message << std::endl;
     std::cout << lld_result << std::endl;
@@ -330,33 +356,29 @@ std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) {
   return hsaco_path;
 }
 
-
-hipModule_t amdgpu_to_hipmodule(const std::string& path) {
+hipModule_t amdgpu_to_hipmodule(const std::string &path) {
   // Read HSACO.
   std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate);
   std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();
 
   std::vector<unsigned char> hsaco(hsaco_file_size);
   hsaco_file.seekg(0, std::ios::beg);
-  hsaco_file.read(reinterpret_cast<char*>(&hsaco[0]), hsaco_file_size);
+  hsaco_file.read(reinterpret_cast<char *>(&hsaco[0]), hsaco_file_size);
   hsaco_file.close();
-  hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, hipJitOptionErrorLogBuffer,
-                            hipJitOptionInfoLogBufferSizeBytes, hipJitOptionInfoLogBuffer,
-                            hipJitOptionLogVerbose};
+  hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes,
+                        hipJitOptionErrorLogBuffer,
+                        hipJitOptionInfoLogBufferSizeBytes,
+                        hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose};
   const unsigned int errbufsize = 8192;
   const unsigned int logbufsize = 8192;
   char _err[errbufsize];
   char _log[logbufsize];
-  void* optval[] = {(void*)(uintptr_t)errbufsize,
-                    (void*)_err, (void*)(uintptr_t)logbufsize,
-                    (void*)_log, (void*)1};
+  void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err,
+                    (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1};
   hipModule_t ret;
   dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval);
   return ret;
 }
 
-
-
-}
-}
-
+} // namespace driver
+} // namespace triton
diff --git a/python/src/pybind11/attr.h b/python/src/pybind11/attr.h
index 6962d6fc5..eada4e1f1 100644
--- a/python/src/pybind11/attr.h
+++ b/python/src/pybind11/attr.h
@@ -18,60 +18,83 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 /// @{
 
 /// Annotation for methods
-struct is_method { handle class_; is_method(const handle &c) : class_(c) { } };
+struct is_method {
+  handle class_;
+  is_method(const handle &c) : class_(c) {}
+};
 
 /// Annotation for operators
-struct is_operator { };
+struct is_operator {};
 
 /// Annotation for parent scope
-struct scope { handle value; scope(const handle &s) : value(s) { } };
+struct scope {
+  handle value;
+  scope(const handle &s) : value(s) {}
+};
 
 /// Annotation for documentation
-struct doc { const char *value; doc(const char *value) : value(value) { } };
+struct doc {
+  const char *value;
+  doc(const char *value) : value(value) {}
+};
 
 /// Annotation for function names
-struct name { const char *value; name(const char *value) : value(value) { } };
+struct name {
+  const char *value;
+  name(const char *value) : value(value) {}
+};
 
-/// Annotation indicating that a function is an overload associated with a given "sibling"
-struct sibling { handle value; sibling(const handle &value) : value(value.ptr()) { } };
+/// Annotation indicating that a function is an overload associated with a given
+/// "sibling"
+struct sibling {
+  handle value;
+  sibling(const handle &value) : value(value.ptr()) {}
+};
 
 /// Annotation indicating that a class derives from another given type
 template <typename T> struct base {
-    PYBIND11_DEPRECATED("base<T>() was deprecated in favor of specifying 'T' as a template argument to class_")
-    base() { }
+  PYBIND11_DEPRECATED("base<T>() was deprecated in favor of specifying 'T' as "
+                      "a template argument to class_")
+  base() {}
 };
 
 /// Keep patient alive while nurse lives
-template <size_t Nurse, size_t Patient> struct keep_alive { };
+template <size_t Nurse, size_t Patient> struct keep_alive {};
 
-/// Annotation indicating that a class is involved in a multiple inheritance relationship
-struct multiple_inheritance { };
+/// Annotation indicating that a class is involved in a multiple inheritance
+/// relationship
+struct multiple_inheritance {};
 
 /// Annotation which enables dynamic attributes, i.e. adds `__dict__` to a class
-struct dynamic_attr { };
+struct dynamic_attr {};
 
 /// Annotation which enables the buffer protocol for a type
-struct buffer_protocol { };
+struct buffer_protocol {};
 
 /// Annotation which requests that a special metaclass is created for a type
 struct metaclass {
-    handle value;
+  handle value;
 
-    PYBIND11_DEPRECATED("py::metaclass() is no longer required. It's turned on by default now.")
-    metaclass() {}
+  PYBIND11_DEPRECATED(
+      "py::metaclass() is no longer required. It's turned on by default now.")
+  metaclass() {}
 
-    /// Override pybind11's default metaclass
-    explicit metaclass(handle value) : value(value) { }
+  /// Override pybind11's default metaclass
+  explicit metaclass(handle value) : value(value) {}
 };
 
 /// Annotation that marks a class as local to the module:
-struct module_local { const bool value; constexpr module_local(bool v = true) : value(v) { } };
+struct module_local {
+  const bool value;
+  constexpr module_local(bool v = true) : value(v) {}
+};
 
 /// Annotation to mark enums as an arithmetic type
-struct arithmetic { };
+struct arithmetic {};
 
 /** \rst
-    A call policy which places one or more guard variables (``Ts...``) around the function call.
+    A call policy which places one or more guard variables (``Ts...``) around
+ the function call.
 
     For example, this definition:
 
@@ -92,20 +115,19 @@ template <typename... Ts> struct call_guard;
 
 template <> struct call_guard<> { using type = detail::void_type; };
 
-template <typename T>
-struct call_guard<T> {
-    static_assert(std::is_default_constructible<T>::value,
-                  "The guard type must be default constructible");
+template <typename T> struct call_guard<T> {
+  static_assert(std::is_default_constructible<T>::value,
+                "The guard type must be default constructible");
 
-    using type = T;
+  using type = T;
 };
 
-template <typename T, typename... Ts>
-struct call_guard<T, Ts...> {
-    struct type {
-        T guard{}; // Compose multiple guard types with left-to-right default-constructor order
-        typename call_guard<Ts...>::type next{};
-    };
+template <typename T, typename... Ts> struct call_guard<T, Ts...> {
+  struct type {
+    T guard{}; // Compose multiple guard types with left-to-right
+               // default-constructor order
+    typename call_guard<Ts...>::type next{};
+  };
 };
 
 /// @} annotations
@@ -115,181 +137,190 @@ NAMESPACE_BEGIN(detail)
 enum op_id : int;
 enum op_type : int;
 struct undefined_t;
-template <op_id id, op_type ot, typename L = undefined_t, typename R = undefined_t> struct op_;
-inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret);
+template <op_id id, op_type ot, typename L = undefined_t,
+          typename R = undefined_t>
+struct op_;
+inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call,
+                            handle ret);
 
 /// Internal data structure which holds metadata about a keyword argument
 struct argument_record {
-    const char *name;  ///< Argument name
-    const char *descr; ///< Human-readable version of the argument value
-    handle value;      ///< Associated Python object
-    bool convert : 1;  ///< True if the argument is allowed to convert when loading
-    bool none : 1;     ///< True if None is allowed when loading
+  const char *name;  ///< Argument name
+  const char *descr; ///< Human-readable version of the argument value
+  handle value;      ///< Associated Python object
+  bool convert : 1; ///< True if the argument is allowed to convert when loading
+  bool none : 1;    ///< True if None is allowed when loading
 
-    argument_record(const char *name, const char *descr, handle value, bool convert, bool none)
-        : name(name), descr(descr), value(value), convert(convert), none(none) { }
+  argument_record(const char *name, const char *descr, handle value,
+                  bool convert, bool none)
+      : name(name), descr(descr), value(value), convert(convert), none(none) {}
 };
 
-/// Internal data structure which holds metadata about a bound function (signature, overloads, etc.)
+/// Internal data structure which holds metadata about a bound function
+/// (signature, overloads, etc.)
 struct function_record {
-    function_record()
-        : is_constructor(false), is_new_style_constructor(false), is_stateless(false),
-          is_operator(false), has_args(false), has_kwargs(false), is_method(false) { }
+  function_record()
+      : is_constructor(false), is_new_style_constructor(false),
+        is_stateless(false), is_operator(false), has_args(false),
+        has_kwargs(false), is_method(false) {}
 
-    /// Function name
-    char *name = nullptr; /* why no C++ strings? They generate heavier code.. */
+  /// Function name
+  char *name = nullptr; /* why no C++ strings? They generate heavier code.. */
 
-    // User-specified documentation string
-    char *doc = nullptr;
+  /// User-specified documentation string
+  char *doc = nullptr;
 
-    /// Human-readable version of the function signature
-    char *signature = nullptr;
+  /// Human-readable version of the function signature
+  char *signature = nullptr;
 
-    /// List of registered keyword arguments
-    std::vector<argument_record> args;
+  /// List of registered keyword arguments
+  std::vector<argument_record> args;
 
-    /// Pointer to lambda function which converts arguments and performs the actual call
-    handle (*impl) (function_call &) = nullptr;
+  /// Pointer to lambda function which converts arguments and performs the
+  /// actual call
+  handle (*impl)(function_call &) = nullptr;
 
-    /// Storage for the wrapped function pointer and captured data, if any
-    void *data[3] = { };
+  /// Storage for the wrapped function pointer and captured data, if any
+  void *data[3] = {};
 
-    /// Pointer to custom destructor for 'data' (if needed)
-    void (*free_data) (function_record *ptr) = nullptr;
+  /// Pointer to custom destructor for 'data' (if needed)
+  void (*free_data)(function_record *ptr) = nullptr;
 
-    /// Return value policy associated with this function
-    return_value_policy policy = return_value_policy::automatic;
+  /// Return value policy associated with this function
+  return_value_policy policy = return_value_policy::automatic;
 
-    /// True if name == '__init__'
-    bool is_constructor : 1;
+  /// True if name == '__init__'
+  bool is_constructor : 1;
 
-    /// True if this is a new-style `__init__` defined in `detail/init.h`
-    bool is_new_style_constructor : 1;
+  /// True if this is a new-style `__init__` defined in `detail/init.h`
+  bool is_new_style_constructor : 1;
 
-    /// True if this is a stateless function pointer
-    bool is_stateless : 1;
+  /// True if this is a stateless function pointer
+  bool is_stateless : 1;
 
-    /// True if this is an operator (__add__), etc.
-    bool is_operator : 1;
+  /// True if this is an operator (__add__), etc.
+  bool is_operator : 1;
 
-    /// True if the function has a '*args' argument
-    bool has_args : 1;
+  /// True if the function has a '*args' argument
+  bool has_args : 1;
 
-    /// True if the function has a '**kwargs' argument
-    bool has_kwargs : 1;
+  /// True if the function has a '**kwargs' argument
+  bool has_kwargs : 1;
 
-    /// True if this is a method
-    bool is_method : 1;
+  /// True if this is a method
+  bool is_method : 1;
 
-    /// Number of arguments (including py::args and/or py::kwargs, if present)
-    std::uint16_t nargs;
+  /// Number of arguments (including py::args and/or py::kwargs, if present)
+  std::uint16_t nargs;
 
-    /// Python method object
-    PyMethodDef *def = nullptr;
+  /// Python method object
+  PyMethodDef *def = nullptr;
 
-    /// Python handle to the parent scope (a class or a module)
-    handle scope;
+  /// Python handle to the parent scope (a class or a module)
+  handle scope;
 
-    /// Python handle to the sibling function representing an overload chain
-    handle sibling;
+  /// Python handle to the sibling function representing an overload chain
+  handle sibling;
 
-    /// Pointer to next overload
-    function_record *next = nullptr;
+  /// Pointer to next overload
+  function_record *next = nullptr;
 };
 
-/// Special data structure which (temporarily) holds metadata about a bound class
+/// Special data structure which (temporarily) holds metadata about a bound
+/// class
 struct type_record {
-    PYBIND11_NOINLINE type_record()
-        : multiple_inheritance(false), dynamic_attr(false), buffer_protocol(false),
-          default_holder(true), module_local(false) { }
+  PYBIND11_NOINLINE type_record()
+      : multiple_inheritance(false), dynamic_attr(false),
+        buffer_protocol(false), default_holder(true), module_local(false) {}
 
-    /// Handle to the parent scope
-    handle scope;
+  /// Handle to the parent scope
+  handle scope;
 
-    /// Name of the class
-    const char *name = nullptr;
+  /// Name of the class
+  const char *name = nullptr;
 
-    // Pointer to RTTI type_info data structure
-    const std::type_info *type = nullptr;
+  /// Pointer to RTTI type_info data structure
+  const std::type_info *type = nullptr;
 
-    /// How large is the underlying C++ type?
-    size_t type_size = 0;
+  /// How large is the underlying C++ type?
+  size_t type_size = 0;
 
-    /// What is the alignment of the underlying C++ type?
-    size_t type_align = 0;
+  /// What is the alignment of the underlying C++ type?
+  size_t type_align = 0;
 
-    /// How large is the type's holder?
-    size_t holder_size = 0;
+  /// How large is the type's holder?
+  size_t holder_size = 0;
 
-    /// The global operator new can be overridden with a class-specific variant
-    void *(*operator_new)(size_t) = nullptr;
+  /// The global operator new can be overridden with a class-specific variant
+  void *(*operator_new)(size_t) = nullptr;
 
-    /// Function pointer to class_<..>::init_instance
-    void (*init_instance)(instance *, const void *) = nullptr;
+  /// Function pointer to class_<..>::init_instance
+  void (*init_instance)(instance *, const void *) = nullptr;
 
-    /// Function pointer to class_<..>::dealloc
-    void (*dealloc)(detail::value_and_holder &) = nullptr;
+  /// Function pointer to class_<..>::dealloc
+  void (*dealloc)(detail::value_and_holder &) = nullptr;
 
-    /// List of base classes of the newly created type
-    list bases;
+  /// List of base classes of the newly created type
+  list bases;
 
-    /// Optional docstring
-    const char *doc = nullptr;
+  /// Optional docstring
+  const char *doc = nullptr;
 
-    /// Custom metaclass (optional)
-    handle metaclass;
+  /// Custom metaclass (optional)
+  handle metaclass;
 
-    /// Multiple inheritance marker
-    bool multiple_inheritance : 1;
+  /// Multiple inheritance marker
+  bool multiple_inheritance : 1;
 
-    /// Does the class manage a __dict__?
-    bool dynamic_attr : 1;
+  /// Does the class manage a __dict__?
+  bool dynamic_attr : 1;
 
-    /// Does the class implement the buffer protocol?
-    bool buffer_protocol : 1;
+  /// Does the class implement the buffer protocol?
+  bool buffer_protocol : 1;
 
-    /// Is the default (unique_ptr) holder type used?
-    bool default_holder : 1;
+  /// Is the default (unique_ptr) holder type used?
+  bool default_holder : 1;
 
-    /// Is the class definition local to the module shared object?
-    bool module_local : 1;
+  /// Is the class definition local to the module shared object?
+  bool module_local : 1;
 
-    PYBIND11_NOINLINE void add_base(const std::type_info &base, void *(*caster)(void *)) {
-        auto base_info = detail::get_type_info(base, false);
-        if (!base_info) {
-            std::string tname(base.name());
-            detail::clean_type_id(tname);
-            pybind11_fail("generic_type: type \"" + std::string(name) +
-                          "\" referenced unknown base type \"" + tname + "\"");
-        }
-
-        if (default_holder != base_info->default_holder) {
-            std::string tname(base.name());
-            detail::clean_type_id(tname);
-            pybind11_fail("generic_type: type \"" + std::string(name) + "\" " +
-                    (default_holder ? "does not have" : "has") +
-                    " a non-default holder type while its base \"" + tname + "\" " +
-                    (base_info->default_holder ? "does not" : "does"));
-        }
-
-        bases.append((PyObject *) base_info->type);
-
-        if (base_info->type->tp_dictoffset != 0)
-            dynamic_attr = true;
-
-        if (caster)
-            base_info->implicit_casts.emplace_back(type, caster);
+  PYBIND11_NOINLINE void add_base(const std::type_info &base,
+                                  void *(*caster)(void *)) {
+    auto base_info = detail::get_type_info(base, false);
+    if (!base_info) {
+      std::string tname(base.name());
+      detail::clean_type_id(tname);
+      pybind11_fail("generic_type: type \"" + std::string(name) +
+                    "\" referenced unknown base type \"" + tname + "\"");
     }
+
+    if (default_holder != base_info->default_holder) {
+      std::string tname(base.name());
+      detail::clean_type_id(tname);
+      pybind11_fail("generic_type: type \"" + std::string(name) + "\" " +
+                    (default_holder ? "does not have" : "has") +
+                    " a non-default holder type while its base \"" + tname +
+                    "\" " + (base_info->default_holder ? "does not" : "does"));
+    }
+
+    bases.append((PyObject *)base_info->type);
+
+    if (base_info->type->tp_dictoffset != 0)
+      dynamic_attr = true;
+
+    if (caster)
+      base_info->implicit_casts.emplace_back(type, caster);
+  }
 };
 
-inline function_call::function_call(const function_record &f, handle p) :
-        func(f), parent(p) {
-    args.reserve(f.nargs);
-    args_convert.reserve(f.nargs);
+inline function_call::function_call(const function_record &f, handle p)
+    : func(f), parent(p) {
+  args.reserve(f.nargs);
+  args_convert.reserve(f.nargs);
 }
 
 /// Tag for a new-style `__init__` defined in `detail/init.h`
-struct is_new_style_constructor { };
+struct is_new_style_constructor {};
 
 /**
  * Partial template specializations to process custom attributes provided to
@@ -300,135 +331,191 @@ struct is_new_style_constructor { };
 template <typename T, typename SFINAE = void> struct process_attribute;
 
 template <typename T> struct process_attribute_default {
-    /// Default implementation: do nothing
-    static void init(const T &, function_record *) { }
-    static void init(const T &, type_record *) { }
-    static void precall(function_call &) { }
-    static void postcall(function_call &, handle) { }
+  /// Default implementation: do nothing
+  static void init(const T &, function_record *) {}
+  static void init(const T &, type_record *) {}
+  static void precall(function_call &) {}
+  static void postcall(function_call &, handle) {}
 };
 
 /// Process an attribute specifying the function's name
 template <> struct process_attribute<name> : process_attribute_default<name> {
-    static void init(const name &n, function_record *r) { r->name = const_cast<char *>(n.value); }
+  static void init(const name &n, function_record *r) {
+    r->name = const_cast<char *>(n.value);
+  }
 };
 
 /// Process an attribute specifying the function's docstring
 template <> struct process_attribute<doc> : process_attribute_default<doc> {
-    static void init(const doc &n, function_record *r) { r->doc = const_cast<char *>(n.value); }
+  static void init(const doc &n, function_record *r) {
+    r->doc = const_cast<char *>(n.value);
+  }
 };
 
-/// Process an attribute specifying the function's docstring (provided as a C-style string)
-template <> struct process_attribute<const char *> : process_attribute_default<const char *> {
-    static void init(const char *d, function_record *r) { r->doc = const_cast<char *>(d); }
-    static void init(const char *d, type_record *r) { r->doc = const_cast<char *>(d); }
+/// Process an attribute specifying the function's docstring (provided as a
+/// C-style string)
+template <>
+struct process_attribute<const char *>
+    : process_attribute_default<const char *> {
+  static void init(const char *d, function_record *r) {
+    r->doc = const_cast<char *>(d);
+  }
+  static void init(const char *d, type_record *r) {
+    r->doc = const_cast<char *>(d);
+  }
 };
-template <> struct process_attribute<char *> : process_attribute<const char *> { };
+template <>
+struct process_attribute<char *> : process_attribute<const char *> {};
 
 /// Process an attribute indicating the function's return value policy
-template <> struct process_attribute<return_value_policy> : process_attribute_default<return_value_policy> {
-    static void init(const return_value_policy &p, function_record *r) { r->policy = p; }
+template <>
+struct process_attribute<return_value_policy>
+    : process_attribute_default<return_value_policy> {
+  static void init(const return_value_policy &p, function_record *r) {
+    r->policy = p;
+  }
 };
 
-/// Process an attribute which indicates that this is an overloaded function associated with a given sibling
-template <> struct process_attribute<sibling> : process_attribute_default<sibling> {
-    static void init(const sibling &s, function_record *r) { r->sibling = s.value; }
+/// Process an attribute which indicates that this is an overloaded function
+/// associated with a given sibling
+template <>
+struct process_attribute<sibling> : process_attribute_default<sibling> {
+  static void init(const sibling &s, function_record *r) {
+    r->sibling = s.value;
+  }
 };
 
 /// Process an attribute which indicates that this function is a method
-template <> struct process_attribute<is_method> : process_attribute_default<is_method> {
-    static void init(const is_method &s, function_record *r) { r->is_method = true; r->scope = s.class_; }
+template <>
+struct process_attribute<is_method> : process_attribute_default<is_method> {
+  static void init(const is_method &s, function_record *r) {
+    r->is_method = true;
+    r->scope = s.class_;
+  }
 };
 
 /// Process an attribute which indicates the parent scope of a method
 template <> struct process_attribute<scope> : process_attribute_default<scope> {
-    static void init(const scope &s, function_record *r) { r->scope = s.value; }
+  static void init(const scope &s, function_record *r) { r->scope = s.value; }
 };
 
 /// Process an attribute which indicates that this function is an operator
-template <> struct process_attribute<is_operator> : process_attribute_default<is_operator> {
-    static void init(const is_operator &, function_record *r) { r->is_operator = true; }
+template <>
+struct process_attribute<is_operator> : process_attribute_default<is_operator> {
+  static void init(const is_operator &, function_record *r) {
+    r->is_operator = true;
+  }
 };
 
-template <> struct process_attribute<is_new_style_constructor> : process_attribute_default<is_new_style_constructor> {
-    static void init(const is_new_style_constructor &, function_record *r) { r->is_new_style_constructor = true; }
+template <>
+struct process_attribute<is_new_style_constructor>
+    : process_attribute_default<is_new_style_constructor> {
+  static void init(const is_new_style_constructor &, function_record *r) {
+    r->is_new_style_constructor = true;
+  }
 };
 
 /// Process a keyword argument attribute (*without* a default value)
 template <> struct process_attribute<arg> : process_attribute_default<arg> {
-    static void init(const arg &a, function_record *r) {
-        if (r->is_method && r->args.empty())
-            r->args.emplace_back("self", nullptr, handle(), true /*convert*/, false /*none not allowed*/);
-        r->args.emplace_back(a.name, nullptr, handle(), !a.flag_noconvert, a.flag_none);
-    }
+  static void init(const arg &a, function_record *r) {
+    if (r->is_method && r->args.empty())
+      r->args.emplace_back("self", nullptr, handle(), true /*convert*/,
+                           false /*none not allowed*/);
+    r->args.emplace_back(a.name, nullptr, handle(), !a.flag_noconvert,
+                         a.flag_none);
+  }
 };
 
 /// Process a keyword argument attribute (*with* a default value)
 template <> struct process_attribute<arg_v> : process_attribute_default<arg_v> {
-    static void init(const arg_v &a, function_record *r) {
-        if (r->is_method && r->args.empty())
-            r->args.emplace_back("self", nullptr /*descr*/, handle() /*parent*/, true /*convert*/, false /*none not allowed*/);
+  static void init(const arg_v &a, function_record *r) {
+    if (r->is_method && r->args.empty())
+      r->args.emplace_back("self", nullptr /*descr*/, handle() /*parent*/,
+                           true /*convert*/, false /*none not allowed*/);
 
-        if (!a.value) {
+    if (!a.value) {
 #if !defined(NDEBUG)
-            std::string descr("'");
-            if (a.name) descr += std::string(a.name) + ": ";
-            descr += a.type + "'";
-            if (r->is_method) {
-                if (r->name)
-                    descr += " in method '" + (std::string) str(r->scope) + "." + (std::string) r->name + "'";
-                else
-                    descr += " in method of '" + (std::string) str(r->scope) + "'";
-            } else if (r->name) {
-                descr += " in function '" + (std::string) r->name + "'";
-            }
-            pybind11_fail("arg(): could not convert default argument "
-                          + descr + " into a Python object (type not registered yet?)");
+      std::string descr("'");
+      if (a.name)
+        descr += std::string(a.name) + ": ";
+      descr += a.type + "'";
+      if (r->is_method) {
+        if (r->name)
+          descr += " in method '" + (std::string)str(r->scope) + "." +
+                   (std::string)r->name + "'";
+        else
+          descr += " in method of '" + (std::string)str(r->scope) + "'";
+      } else if (r->name) {
+        descr += " in function '" + (std::string)r->name + "'";
+      }
+      pybind11_fail("arg(): could not convert default argument " + descr +
+                    " into a Python object (type not registered yet?)");
 #else
-            pybind11_fail("arg(): could not convert default argument "
-                          "into a Python object (type not registered yet?). "
-                          "Compile in debug mode for more information.");
+      pybind11_fail("arg(): could not convert default argument "
+                    "into a Python object (type not registered yet?). "
+                    "Compile in debug mode for more information.");
 #endif
-        }
-        r->args.emplace_back(a.name, a.descr, a.value.inc_ref(), !a.flag_noconvert, a.flag_none);
     }
+    r->args.emplace_back(a.name, a.descr, a.value.inc_ref(), !a.flag_noconvert,
+                         a.flag_none);
+  }
 };
 
-/// Process a parent class attribute.  Single inheritance only (class_ itself already guarantees that)
+/// Process a parent class attribute.  Single inheritance only (class_ itself
+/// already guarantees that)
 template <typename T>
-struct process_attribute<T, enable_if_t<is_pyobject<T>::value>> : process_attribute_default<handle> {
-    static void init(const handle &h, type_record *r) { r->bases.append(h); }
+struct process_attribute<T, enable_if_t<is_pyobject<T>::value>>
+    : process_attribute_default<handle> {
+  static void init(const handle &h, type_record *r) { r->bases.append(h); }
 };
 
-/// Process a parent class attribute (deprecated, does not support multiple inheritance)
+/// Process a parent class attribute (deprecated, does not support multiple
+/// inheritance)
 template <typename T>
 struct process_attribute<base<T>> : process_attribute_default<base<T>> {
-    static void init(const base<T> &, type_record *r) { r->add_base(typeid(T), nullptr); }
+  static void init(const base<T> &, type_record *r) {
+    r->add_base(typeid(T), nullptr);
+  }
 };
 
 /// Process a multiple inheritance attribute
 template <>
-struct process_attribute<multiple_inheritance> : process_attribute_default<multiple_inheritance> {
-    static void init(const multiple_inheritance &, type_record *r) { r->multiple_inheritance = true; }
+struct process_attribute<multiple_inheritance>
+    : process_attribute_default<multiple_inheritance> {
+  static void init(const multiple_inheritance &, type_record *r) {
+    r->multiple_inheritance = true;
+  }
 };
 
 template <>
-struct process_attribute<dynamic_attr> : process_attribute_default<dynamic_attr> {
-    static void init(const dynamic_attr &, type_record *r) { r->dynamic_attr = true; }
+struct process_attribute<dynamic_attr>
+    : process_attribute_default<dynamic_attr> {
+  static void init(const dynamic_attr &, type_record *r) {
+    r->dynamic_attr = true;
+  }
 };
 
 template <>
-struct process_attribute<buffer_protocol> : process_attribute_default<buffer_protocol> {
-    static void init(const buffer_protocol &, type_record *r) { r->buffer_protocol = true; }
+struct process_attribute<buffer_protocol>
+    : process_attribute_default<buffer_protocol> {
+  static void init(const buffer_protocol &, type_record *r) {
+    r->buffer_protocol = true;
+  }
 };
 
 template <>
 struct process_attribute<metaclass> : process_attribute_default<metaclass> {
-    static void init(const metaclass &m, type_record *r) { r->metaclass = m.value; }
+  static void init(const metaclass &m, type_record *r) {
+    r->metaclass = m.value;
+  }
 };
 
 template <>
-struct process_attribute<module_local> : process_attribute_default<module_local> {
-    static void init(const module_local &l, type_record *r) { r->module_local = l.value; }
+struct process_attribute<module_local>
+    : process_attribute_default<module_local> {
+  static void init(const module_local &l, type_record *r) {
+    r->module_local = l.value;
+  }
 };
 
 /// Process an 'arithmetic' attribute for enums (does nothing here)
@@ -436,57 +523,78 @@ template <>
 struct process_attribute<arithmetic> : process_attribute_default<arithmetic> {};
 
 template <typename... Ts>
-struct process_attribute<call_guard<Ts...>> : process_attribute_default<call_guard<Ts...>> { };
+struct process_attribute<call_guard<Ts...>>
+    : process_attribute_default<call_guard<Ts...>> {};
 
 /**
  * Process a keep_alive call policy -- invokes keep_alive_impl during the
  * pre-call handler if both Nurse, Patient != 0 and use the post-call handler
  * otherwise
  */
-template <size_t Nurse, size_t Patient> struct process_attribute<keep_alive<Nurse, Patient>> : public process_attribute_default<keep_alive<Nurse, Patient>> {
-    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
-    static void precall(function_call &call) { keep_alive_impl(Nurse, Patient, call, handle()); }
-    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
-    static void postcall(function_call &, handle) { }
-    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
-    static void precall(function_call &) { }
-    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
-    static void postcall(function_call &call, handle ret) { keep_alive_impl(Nurse, Patient, call, ret); }
+template <size_t Nurse, size_t Patient>
+struct process_attribute<keep_alive<Nurse, Patient>>
+    : public process_attribute_default<keep_alive<Nurse, Patient>> {
+  template <size_t N = Nurse, size_t P = Patient,
+            enable_if_t<N != 0 && P != 0, int> = 0>
+  static void precall(function_call &call) {
+    keep_alive_impl(Nurse, Patient, call, handle());
+  }
+  template <size_t N = Nurse, size_t P = Patient,
+            enable_if_t<N != 0 && P != 0, int> = 0>
+  static void postcall(function_call &, handle) {}
+  template <size_t N = Nurse, size_t P = Patient,
+            enable_if_t<N == 0 || P == 0, int> = 0>
+  static void precall(function_call &) {}
+  template <size_t N = Nurse, size_t P = Patient,
+            enable_if_t<N == 0 || P == 0, int> = 0>
+  static void postcall(function_call &call, handle ret) {
+    keep_alive_impl(Nurse, Patient, call, ret);
+  }
 };
 
 /// Recursively iterate over variadic template arguments
 template <typename... Args> struct process_attributes {
-    static void init(const Args&... args, function_record *r) {
-        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::init(args, r), 0) ... };
-        ignore_unused(unused);
-    }
-    static void init(const Args&... args, type_record *r) {
-        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::init(args, r), 0) ... };
-        ignore_unused(unused);
-    }
-    static void precall(function_call &call) {
-        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::precall(call), 0) ... };
-        ignore_unused(unused);
-    }
-    static void postcall(function_call &call, handle fn_ret) {
-        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::postcall(call, fn_ret), 0) ... };
-        ignore_unused(unused);
-    }
+  static void init(const Args &... args, function_record *r) {
+    int unused[] = {
+        0, (process_attribute<typename std::decay<Args>::type>::init(args, r),
+            0)...};
+    ignore_unused(unused);
+  }
+  static void init(const Args &... args, type_record *r) {
+    int unused[] = {
+        0, (process_attribute<typename std::decay<Args>::type>::init(args, r),
+            0)...};
+    ignore_unused(unused);
+  }
+  static void precall(function_call &call) {
+    int unused[] = {
+        0, (process_attribute<typename std::decay<Args>::type>::precall(call),
+            0)...};
+    ignore_unused(unused);
+  }
+  static void postcall(function_call &call, handle fn_ret) {
+    int unused[] = {
+        0, (process_attribute<typename std::decay<Args>::type>::postcall(
+                call, fn_ret),
+            0)...};
+    ignore_unused(unused);
+  }
 };
 
-template <typename T>
-using is_call_guard = is_instantiation<call_guard, T>;
+template <typename T> using is_call_guard = is_instantiation<call_guard, T>;
 
-/// Extract the ``type`` from the first `call_guard` in `Extras...` (or `void_type` if none found)
+/// Extract the ``type`` from the first `call_guard` in `Extra...` (or
+/// `void_type` if none found)
 template <typename... Extra>
-using extract_guard_t = typename exactly_one_t<is_call_guard, call_guard<>, Extra...>::type;
+using extract_guard_t =
+    typename exactly_one_t<is_call_guard, call_guard<>, Extra...>::type;
 
 /// Check the number of named arguments at compile time
 template <typename... Extra,
           size_t named = constexpr_sum(std::is_base_of<arg, Extra>::value...),
-          size_t self  = constexpr_sum(std::is_same<is_method, Extra>::value...)>
+          size_t self = constexpr_sum(std::is_same<is_method, Extra>::value...)>
 constexpr bool expected_num_args(size_t nargs, bool has_args, bool has_kwargs) {
-    return named == 0 || (self + named + has_args + has_kwargs) == nargs;
+  return named == 0 || (self + named + has_args + has_kwargs) == nargs;
 }
 
 NAMESPACE_END(detail)
diff --git a/python/src/pybind11/buffer_info.h b/python/src/pybind11/buffer_info.h
index 9f072fa73..3c080140c 100644
--- a/python/src/pybind11/buffer_info.h
+++ b/python/src/pybind11/buffer_info.h
@@ -15,93 +15,112 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
 /// Information record describing a Python buffer object
 struct buffer_info {
-    void *ptr = nullptr;          // Pointer to the underlying storage
-    ssize_t itemsize = 0;         // Size of individual items in bytes
-    ssize_t size = 0;             // Total number of entries
-    std::string format;           // For homogeneous buffers, this should be set to format_descriptor<T>::format()
-    ssize_t ndim = 0;             // Number of dimensions
-    std::vector<ssize_t> shape;   // Shape of the tensor (1 entry per dimension)
-    std::vector<ssize_t> strides; // Number of entries between adjacent entries (for each per dimension)
+  void *ptr = nullptr;        // Pointer to the underlying storage
+  ssize_t itemsize = 0;       // Size of individual items in bytes
+  ssize_t size = 0;           // Total number of entries
+  std::string format;         // For homogeneous buffers, this should be set to
+                              // format_descriptor<T>::format()
+  ssize_t ndim = 0;           // Number of dimensions
+  std::vector<ssize_t> shape; // Shape of the tensor (1 entry per dimension)
+  std::vector<ssize_t> strides; // Number of bytes between adjacent entries
+                                // (one per dimension)
 
-    buffer_info() { }
+  buffer_info() {}
 
-    buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim,
-                detail::any_container<ssize_t> shape_in, detail::any_container<ssize_t> strides_in)
-    : ptr(ptr), itemsize(itemsize), size(1), format(format), ndim(ndim),
-      shape(std::move(shape_in)), strides(std::move(strides_in)) {
-        if (ndim != (ssize_t) shape.size() || ndim != (ssize_t) strides.size())
-            pybind11_fail("buffer_info: ndim doesn't match shape and/or strides length");
-        for (size_t i = 0; i < (size_t) ndim; ++i)
-            size *= shape[i];
-    }
-
-    template <typename T>
-    buffer_info(T *ptr, detail::any_container<ssize_t> shape_in, detail::any_container<ssize_t> strides_in)
-    : buffer_info(private_ctr_tag(), ptr, sizeof(T), format_descriptor<T>::format(), static_cast<ssize_t>(shape_in->size()), std::move(shape_in), std::move(strides_in)) { }
-
-    buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t size)
-    : buffer_info(ptr, itemsize, format, 1, {size}, {itemsize}) { }
-
-    template <typename T>
-    buffer_info(T *ptr, ssize_t size)
-    : buffer_info(ptr, sizeof(T), format_descriptor<T>::format(), size) { }
-
-    explicit buffer_info(Py_buffer *view, bool ownview = true)
-    : buffer_info(view->buf, view->itemsize, view->format, view->ndim,
-            {view->shape, view->shape + view->ndim}, {view->strides, view->strides + view->ndim}) {
-        this->view = view;
-        this->ownview = ownview;
-    }
-
-    buffer_info(const buffer_info &) = delete;
-    buffer_info& operator=(const buffer_info &) = delete;
-
-    buffer_info(buffer_info &&other) {
-        (*this) = std::move(other);
-    }
-
-    buffer_info& operator=(buffer_info &&rhs) {
-        ptr = rhs.ptr;
-        itemsize = rhs.itemsize;
-        size = rhs.size;
-        format = std::move(rhs.format);
-        ndim = rhs.ndim;
-        shape = std::move(rhs.shape);
-        strides = std::move(rhs.strides);
-        std::swap(view, rhs.view);
-        std::swap(ownview, rhs.ownview);
-        return *this;
-    }
-
-    ~buffer_info() {
-        if (view && ownview) { PyBuffer_Release(view); delete view; }
+  buffer_info(void *ptr, ssize_t itemsize, const std::string &format,
+              ssize_t ndim, detail::any_container<ssize_t> shape_in,
+              detail::any_container<ssize_t> strides_in)
+      : ptr(ptr), itemsize(itemsize), size(1), format(format), ndim(ndim),
+        shape(std::move(shape_in)), strides(std::move(strides_in)) {
+    if (ndim != (ssize_t)shape.size() || ndim != (ssize_t)strides.size())
+      pybind11_fail(
+          "buffer_info: ndim doesn't match shape and/or strides length");
+    for (size_t i = 0; i < (size_t)ndim; ++i)
+      size *= shape[i];
+  }
+
+  template <typename T>
+  buffer_info(T *ptr, detail::any_container<ssize_t> shape_in,
+              detail::any_container<ssize_t> strides_in)
+      : buffer_info(private_ctr_tag(), ptr, sizeof(T),
+                    format_descriptor<T>::format(),
+                    static_cast<ssize_t>(shape_in->size()), std::move(shape_in),
+                    std::move(strides_in)) {}
+
+  buffer_info(void *ptr, ssize_t itemsize, const std::string &format,
+              ssize_t size)
+      : buffer_info(ptr, itemsize, format, 1, {size}, {itemsize}) {}
+
+  template <typename T>
+  buffer_info(T *ptr, ssize_t size)
+      : buffer_info(ptr, sizeof(T), format_descriptor<T>::format(), size) {}
+
+  explicit buffer_info(Py_buffer *view, bool ownview = true)
+      : buffer_info(view->buf, view->itemsize, view->format, view->ndim,
+                    {view->shape, view->shape + view->ndim},
+                    {view->strides, view->strides + view->ndim}) {
+    this->view = view;
+    this->ownview = ownview;
+  }
+
+  buffer_info(const buffer_info &) = delete;
+  buffer_info &operator=(const buffer_info &) = delete;
+
+  buffer_info(buffer_info &&other) { (*this) = std::move(other); }
+
+  buffer_info &operator=(buffer_info &&rhs) {
+    ptr = rhs.ptr;
+    itemsize = rhs.itemsize;
+    size = rhs.size;
+    format = std::move(rhs.format);
+    ndim = rhs.ndim;
+    shape = std::move(rhs.shape);
+    strides = std::move(rhs.strides);
+    std::swap(view, rhs.view);
+    std::swap(ownview, rhs.ownview);
+    return *this;
+  }
+
+  ~buffer_info() {
+    if (view && ownview) {
+      PyBuffer_Release(view);
+      delete view;
     }
+  }
 
 private:
-    struct private_ctr_tag { };
+  struct private_ctr_tag {};
 
-    buffer_info(private_ctr_tag, void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim,
-                detail::any_container<ssize_t> &&shape_in, detail::any_container<ssize_t> &&strides_in)
-    : buffer_info(ptr, itemsize, format, ndim, std::move(shape_in), std::move(strides_in)) { }
+  buffer_info(private_ctr_tag, void *ptr, ssize_t itemsize,
+              const std::string &format, ssize_t ndim,
+              detail::any_container<ssize_t> &&shape_in,
+              detail::any_container<ssize_t> &&strides_in)
+      : buffer_info(ptr, itemsize, format, ndim, std::move(shape_in),
+                    std::move(strides_in)) {}
 
-    Py_buffer *view = nullptr;
-    bool ownview = false;
+  Py_buffer *view = nullptr;
+  bool ownview = false;
 };
 
 NAMESPACE_BEGIN(detail)
 
 template <typename T, typename SFINAE = void> struct compare_buffer_info {
-    static bool compare(const buffer_info& b) {
-        return b.format == format_descriptor<T>::format() && b.itemsize == (ssize_t) sizeof(T);
-    }
+  static bool compare(const buffer_info &b) {
+    return b.format == format_descriptor<T>::format() &&
+           b.itemsize == (ssize_t)sizeof(T);
+  }
 };
 
-template <typename T> struct compare_buffer_info<T, detail::enable_if_t<std::is_integral<T>::value>> {
-    static bool compare(const buffer_info& b) {
-        return (size_t) b.itemsize == sizeof(T) && (b.format == format_descriptor<T>::value ||
-            ((sizeof(T) == sizeof(long)) && b.format == (std::is_unsigned<T>::value ? "L" : "l")) ||
-            ((sizeof(T) == sizeof(size_t)) && b.format == (std::is_unsigned<T>::value ? "N" : "n")));
-    }
+template <typename T>
+struct compare_buffer_info<T, detail::enable_if_t<std::is_integral<T>::value>> {
+  static bool compare(const buffer_info &b) {
+    return (size_t)b.itemsize == sizeof(T) &&
+           (b.format == format_descriptor<T>::value ||
+            ((sizeof(T) == sizeof(long)) &&
+             b.format == (std::is_unsigned<T>::value ? "L" : "l")) ||
+            ((sizeof(T) == sizeof(size_t)) &&
+             b.format == (std::is_unsigned<T>::value ? "N" : "n")));
+  }
 };
 
 NAMESPACE_END(detail)
diff --git a/python/src/pybind11/cast.h b/python/src/pybind11/cast.h
index 8d0fd5d90..49a9adb29 100644
--- a/python/src/pybind11/cast.h
+++ b/python/src/pybind11/cast.h
@@ -10,23 +10,23 @@
 
 #pragma once
 
-#include "pytypes.h"
-#include "detail/typeid.h"
 #include "detail/descr.h"
 #include "detail/internals.h"
+#include "detail/typeid.h"
+#include "pytypes.h"
 #include <array>
 #include <limits>
 #include <tuple>
 #include <type_traits>
 
 #if defined(PYBIND11_CPP17)
-#  if defined(__has_include)
-#    if __has_include(<string_view>)
-#      define PYBIND11_HAS_STRING_VIEW
-#    endif
-#  elif defined(_MSC_VER)
-#    define PYBIND11_HAS_STRING_VIEW
-#  endif
+#if defined(__has_include)
+#if __has_include(<string_view>)
+#define PYBIND11_HAS_STRING_VIEW
+#endif
+#elif defined(_MSC_VER)
+#define PYBIND11_HAS_STRING_VIEW
+#endif
 #endif
 #ifdef PYBIND11_HAS_STRING_VIEW
 #include <string_view>
@@ -35,444 +35,486 @@
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
-/// A life support system for temporary objects created by `type_caster::load()`.
-/// Adding a patient will keep it alive up until the enclosing function returns.
+/// A life support system for temporary objects created by
+/// `type_caster::load()`. Adding a patient will keep it alive up until the
+/// enclosing function returns.
 class loader_life_support {
 public:
-    /// A new patient frame is created when a function is entered
-    loader_life_support() {
-        get_internals().loader_patient_stack.push_back(nullptr);
-    }
-
-    /// ... and destroyed after it returns
-    ~loader_life_support() {
-        auto &stack = get_internals().loader_patient_stack;
-        if (stack.empty())
-            pybind11_fail("loader_life_support: internal error");
-
-        auto ptr = stack.back();
-        stack.pop_back();
-        Py_CLEAR(ptr);
-
-        // A heuristic to reduce the stack's capacity (e.g. after long recursive calls)
-        if (stack.capacity() > 16 && stack.size() != 0 && stack.capacity() / stack.size() > 2)
-            stack.shrink_to_fit();
-    }
-
-    /// This can only be used inside a pybind11-bound function, either by `argument_loader`
-    /// at argument preparation time or by `py::cast()` at execution time.
-    PYBIND11_NOINLINE static void add_patient(handle h) {
-        auto &stack = get_internals().loader_patient_stack;
-        if (stack.empty())
-            throw cast_error("When called outside a bound function, py::cast() cannot "
-                             "do Python -> C++ conversions which require the creation "
-                             "of temporary values");
-
-        auto &list_ptr = stack.back();
-        if (list_ptr == nullptr) {
-            list_ptr = PyList_New(1);
-            if (!list_ptr)
-                pybind11_fail("loader_life_support: error allocating list");
-            PyList_SET_ITEM(list_ptr, 0, h.inc_ref().ptr());
-        } else {
-            auto result = PyList_Append(list_ptr, h.ptr());
-            if (result == -1)
-                pybind11_fail("loader_life_support: error adding patient");
-        }
+  /// A new patient frame is created when a function is entered
+  loader_life_support() {
+    get_internals().loader_patient_stack.push_back(nullptr);
+  }
+
+  /// ... and destroyed after it returns
+  ~loader_life_support() {
+    auto &stack = get_internals().loader_patient_stack;
+    if (stack.empty())
+      pybind11_fail("loader_life_support: internal error");
+
+    auto ptr = stack.back();
+    stack.pop_back();
+    Py_CLEAR(ptr);
+
+    // A heuristic to reduce the stack's capacity (e.g. after long recursive
+    // calls)
+    if (stack.capacity() > 16 && stack.size() != 0 &&
+        stack.capacity() / stack.size() > 2)
+      stack.shrink_to_fit();
+  }
+
+  /// This can only be used inside a pybind11-bound function, either by
+  /// `argument_loader` at argument preparation time or by `py::cast()` at
+  /// execution time.
+  PYBIND11_NOINLINE static void add_patient(handle h) {
+    auto &stack = get_internals().loader_patient_stack;
+    if (stack.empty())
+      throw cast_error(
+          "When called outside a bound function, py::cast() cannot "
+          "do Python -> C++ conversions which require the creation "
+          "of temporary values");
+
+    auto &list_ptr = stack.back();
+    if (list_ptr == nullptr) {
+      list_ptr = PyList_New(1);
+      if (!list_ptr)
+        pybind11_fail("loader_life_support: error allocating list");
+      PyList_SET_ITEM(list_ptr, 0, h.inc_ref().ptr());
+    } else {
+      auto result = PyList_Append(list_ptr, h.ptr());
+      if (result == -1)
+        pybind11_fail("loader_life_support: error adding patient");
     }
+  }
 };
 
-// Gets the cache entry for the given type, creating it if necessary.  The return value is the pair
-// returned by emplace, i.e. an iterator for the entry and a bool set to `true` if the entry was
-// just created.
-inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_type_info_get_cache(PyTypeObject *type);
+// Gets the cache entry for the given type, creating it if necessary.  The
+// return value is the pair returned by emplace, i.e. an iterator for the entry
+// and a bool set to `true` if the entry was just created.
+inline std::pair<decltype(internals::registered_types_py)::iterator, bool>
+all_type_info_get_cache(PyTypeObject *type);
 
 // Populates a just-created cache entry.
-PYBIND11_NOINLINE inline void all_type_info_populate(PyTypeObject *t, std::vector<type_info *> &bases) {
-    std::vector<PyTypeObject *> check;
-    for (handle parent : reinterpret_borrow<tuple>(t->tp_bases))
-        check.push_back((PyTypeObject *) parent.ptr());
+PYBIND11_NOINLINE inline void
+all_type_info_populate(PyTypeObject *t, std::vector<type_info *> &bases) {
+  std::vector<PyTypeObject *> check;
+  for (handle parent : reinterpret_borrow<tuple>(t->tp_bases))
+    check.push_back((PyTypeObject *)parent.ptr());
 
-    auto const &type_dict = get_internals().registered_types_py;
-    for (size_t i = 0; i < check.size(); i++) {
-        auto type = check[i];
-        // Ignore Python2 old-style class super type:
-        if (!PyType_Check((PyObject *) type)) continue;
+  auto const &type_dict = get_internals().registered_types_py;
+  for (size_t i = 0; i < check.size(); i++) {
+    auto type = check[i];
+    // Ignore Python2 old-style class super type:
+    if (!PyType_Check((PyObject *)type))
+      continue;
 
-        // Check `type` in the current set of registered python types:
-        auto it = type_dict.find(type);
-        if (it != type_dict.end()) {
-            // We found a cache entry for it, so it's either pybind-registered or has pre-computed
-            // pybind bases, but we have to make sure we haven't already seen the type(s) before: we
-            // want to follow Python/virtual C++ rules that there should only be one instance of a
-            // common base.
-            for (auto *tinfo : it->second) {
-                // NB: Could use a second set here, rather than doing a linear search, but since
-                // having a large number of immediate pybind11-registered types seems fairly
-                // unlikely, that probably isn't worthwhile.
-                bool found = false;
-                for (auto *known : bases) {
-                    if (known == tinfo) { found = true; break; }
-                }
-                if (!found) bases.push_back(tinfo);
-            }
-        }
-        else if (type->tp_bases) {
-            // It's some python type, so keep follow its bases classes to look for one or more
-            // registered types
-            if (i + 1 == check.size()) {
-                // When we're at the end, we can pop off the current element to avoid growing
-                // `check` when adding just one base (which is typical--i.e. when there is no
-                // multiple inheritance)
-                check.pop_back();
-                i--;
-            }
-            for (handle parent : reinterpret_borrow<tuple>(type->tp_bases))
-                check.push_back((PyTypeObject *) parent.ptr());
+    // Check `type` in the current set of registered python types:
+    auto it = type_dict.find(type);
+    if (it != type_dict.end()) {
+      // We found a cache entry for it, so it's either pybind-registered or has
+      // pre-computed pybind bases, but we have to make sure we haven't already
+      // seen the type(s) before: we want to follow Python/virtual C++ rules
+      // that there should only be one instance of a common base.
+      for (auto *tinfo : it->second) {
+        // NB: Could use a second set here, rather than doing a linear search,
+        // but since having a large number of immediate pybind11-registered
+        // types seems fairly unlikely, that probably isn't worthwhile.
+        bool found = false;
+        for (auto *known : bases) {
+          if (known == tinfo) {
+            found = true;
+            break;
+          }
         }
+        if (!found)
+          bases.push_back(tinfo);
+      }
+    } else if (type->tp_bases) {
+      // It's some python type, so keep following its base classes to look for
+      // one or more registered types
+      if (i + 1 == check.size()) {
+        // When we're at the end, we can pop off the current element to avoid
+        // growing `check` when adding just one base (which is typical--i.e.
+        // when there is no multiple inheritance)
+        check.pop_back();
+        i--;
+      }
+      for (handle parent : reinterpret_borrow<tuple>(type->tp_bases))
+        check.push_back((PyTypeObject *)parent.ptr());
     }
+  }
 }
 
 /**
- * Extracts vector of type_info pointers of pybind-registered roots of the given Python type.  Will
- * be just 1 pybind type for the Python type of a pybind-registered class, or for any Python-side
- * derived class that uses single inheritance.  Will contain as many types as required for a Python
- * class that uses multiple inheritance to inherit (directly or indirectly) from multiple
- * pybind-registered classes.  Will be empty if neither the type nor any base classes are
- * pybind-registered.
+ * Extracts vector of type_info pointers of pybind-registered roots of the given
+ * Python type.  Will be just 1 pybind type for the Python type of a
+ * pybind-registered class, or for any Python-side derived class that uses
+ * single inheritance.  Will contain as many types as required for a Python
+ * class that uses multiple inheritance to inherit (directly or indirectly) from
+ * multiple pybind-registered classes.  Will be empty if neither the type nor
+ * any base classes are pybind-registered.
  *
  * The value is cached for the lifetime of the Python type.
  */
-inline const std::vector<detail::type_info *> &all_type_info(PyTypeObject *type) {
-    auto ins = all_type_info_get_cache(type);
-    if (ins.second)
-        // New cache entry: populate it
-        all_type_info_populate(type, ins.first->second);
+inline const std::vector<detail::type_info *> &
+all_type_info(PyTypeObject *type) {
+  auto ins = all_type_info_get_cache(type);
+  if (ins.second)
+    // New cache entry: populate it
+    all_type_info_populate(type, ins.first->second);
 
-    return ins.first->second;
+  return ins.first->second;
 }
 
 /**
- * Gets a single pybind11 type info for a python type.  Returns nullptr if neither the type nor any
- * ancestors are pybind11-registered.  Throws an exception if there are multiple bases--use
- * `all_type_info` instead if you want to support multiple bases.
+ * Gets a single pybind11 type info for a python type.  Returns nullptr if
+ * neither the type nor any ancestors are pybind11-registered.  Throws an
+ * exception if there are multiple bases--use `all_type_info` instead if you
+ * want to support multiple bases.
  */
-PYBIND11_NOINLINE inline detail::type_info* get_type_info(PyTypeObject *type) {
-    auto &bases = all_type_info(type);
-    if (bases.size() == 0)
-        return nullptr;
-    if (bases.size() > 1)
-        pybind11_fail("pybind11::detail::get_type_info: type has multiple pybind11-registered bases");
-    return bases.front();
+PYBIND11_NOINLINE inline detail::type_info *get_type_info(PyTypeObject *type) {
+  auto &bases = all_type_info(type);
+  if (bases.size() == 0)
+    return nullptr;
+  if (bases.size() > 1)
+    pybind11_fail("pybind11::detail::get_type_info: type has multiple "
+                  "pybind11-registered bases");
+  return bases.front();
 }
 
 inline detail::type_info *get_local_type_info(const std::type_index &tp) {
-    auto &locals = registered_local_types_cpp();
-    auto it = locals.find(tp);
-    if (it != locals.end())
-        return it->second;
-    return nullptr;
+  auto &locals = registered_local_types_cpp();
+  auto it = locals.find(tp);
+  if (it != locals.end())
+    return it->second;
+  return nullptr;
 }
 
 inline detail::type_info *get_global_type_info(const std::type_index &tp) {
-    auto &types = get_internals().registered_types_cpp;
-    auto it = types.find(tp);
-    if (it != types.end())
-        return it->second;
-    return nullptr;
+  auto &types = get_internals().registered_types_cpp;
+  auto it = types.find(tp);
+  if (it != types.end())
+    return it->second;
+  return nullptr;
 }
 
-/// Return the type info for a given C++ type; on lookup failure can either throw or return nullptr.
-PYBIND11_NOINLINE inline detail::type_info *get_type_info(const std::type_index &tp,
-                                                          bool throw_if_missing = false) {
-    if (auto ltype = get_local_type_info(tp))
-        return ltype;
-    if (auto gtype = get_global_type_info(tp))
-        return gtype;
+/// Return the type info for a given C++ type; on lookup failure can either
+/// throw or return nullptr.
+PYBIND11_NOINLINE inline detail::type_info *
+get_type_info(const std::type_index &tp, bool throw_if_missing = false) {
+  if (auto ltype = get_local_type_info(tp))
+    return ltype;
+  if (auto gtype = get_global_type_info(tp))
+    return gtype;
 
-    if (throw_if_missing) {
-        std::string tname = tp.name();
-        detail::clean_type_id(tname);
-        pybind11_fail("pybind11::detail::get_type_info: unable to find type info for \"" + tname + "\"");
-    }
-    return nullptr;
+  if (throw_if_missing) {
+    std::string tname = tp.name();
+    detail::clean_type_id(tname);
+    pybind11_fail(
+        "pybind11::detail::get_type_info: unable to find type info for \"" +
+        tname + "\"");
+  }
+  return nullptr;
 }
 
-PYBIND11_NOINLINE inline handle get_type_handle(const std::type_info &tp, bool throw_if_missing) {
-    detail::type_info *type_info = get_type_info(tp, throw_if_missing);
-    return handle(type_info ? ((PyObject *) type_info->type) : nullptr);
+PYBIND11_NOINLINE inline handle get_type_handle(const std::type_info &tp,
+                                                bool throw_if_missing) {
+  detail::type_info *type_info = get_type_info(tp, throw_if_missing);
+  return handle(type_info ? ((PyObject *)type_info->type) : nullptr);
 }
 
 struct value_and_holder {
-    instance *inst = nullptr;
-    size_t index = 0u;
-    const detail::type_info *type = nullptr;
-    void **vh = nullptr;
+  instance *inst = nullptr;
+  size_t index = 0u;
+  const detail::type_info *type = nullptr;
+  void **vh = nullptr;
 
-    // Main constructor for a found value/holder:
-    value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index) :
-        inst{i}, index{index}, type{type},
-        vh{inst->simple_layout ? inst->simple_value_holder : &inst->nonsimple.values_and_holders[vpos]}
-    {}
+  // Main constructor for a found value/holder:
+  value_and_holder(instance *i, const detail::type_info *type, size_t vpos,
+                   size_t index)
+      : inst{i}, index{index}, type{type},
+        vh{inst->simple_layout ? inst->simple_value_holder
+                               : &inst->nonsimple.values_and_holders[vpos]} {}
 
-    // Default constructor (used to signal a value-and-holder not found by get_value_and_holder())
-    value_and_holder() {}
+  // Default constructor (used to signal a value-and-holder not found by
+  // get_value_and_holder())
+  value_and_holder() {}
 
-    // Used for past-the-end iterator
-    value_and_holder(size_t index) : index{index} {}
+  // Used for past-the-end iterator
+  value_and_holder(size_t index) : index{index} {}
 
-    template <typename V = void> V *&value_ptr() const {
-        return reinterpret_cast<V *&>(vh[0]);
-    }
-    // True if this `value_and_holder` has a non-null value pointer
-    explicit operator bool() const { return value_ptr(); }
+  template <typename V = void> V *&value_ptr() const {
+    return reinterpret_cast<V *&>(vh[0]);
+  }
+  // True if this `value_and_holder` has a non-null value pointer
+  explicit operator bool() const { return value_ptr(); }
 
-    template <typename H> H &holder() const {
-        return reinterpret_cast<H &>(vh[1]);
-    }
-    bool holder_constructed() const {
-        return inst->simple_layout
-            ? inst->simple_holder_constructed
-            : inst->nonsimple.status[index] & instance::status_holder_constructed;
-    }
-    void set_holder_constructed(bool v = true) {
-        if (inst->simple_layout)
-            inst->simple_holder_constructed = v;
-        else if (v)
-            inst->nonsimple.status[index] |= instance::status_holder_constructed;
-        else
-            inst->nonsimple.status[index] &= (uint8_t) ~instance::status_holder_constructed;
-    }
-    bool instance_registered() const {
-        return inst->simple_layout
-            ? inst->simple_instance_registered
-            : inst->nonsimple.status[index] & instance::status_instance_registered;
-    }
-    void set_instance_registered(bool v = true) {
-        if (inst->simple_layout)
-            inst->simple_instance_registered = v;
-        else if (v)
-            inst->nonsimple.status[index] |= instance::status_instance_registered;
-        else
-            inst->nonsimple.status[index] &= (uint8_t) ~instance::status_instance_registered;
-    }
+  template <typename H> H &holder() const {
+    return reinterpret_cast<H &>(vh[1]);
+  }
+  bool holder_constructed() const {
+    return inst->simple_layout ? inst->simple_holder_constructed
+                               : inst->nonsimple.status[index] &
+                                     instance::status_holder_constructed;
+  }
+  void set_holder_constructed(bool v = true) {
+    if (inst->simple_layout)
+      inst->simple_holder_constructed = v;
+    else if (v)
+      inst->nonsimple.status[index] |= instance::status_holder_constructed;
+    else
+      inst->nonsimple.status[index] &=
+          (uint8_t)~instance::status_holder_constructed;
+  }
+  bool instance_registered() const {
+    return inst->simple_layout ? inst->simple_instance_registered
+                               : inst->nonsimple.status[index] &
+                                     instance::status_instance_registered;
+  }
+  void set_instance_registered(bool v = true) {
+    if (inst->simple_layout)
+      inst->simple_instance_registered = v;
+    else if (v)
+      inst->nonsimple.status[index] |= instance::status_instance_registered;
+    else
+      inst->nonsimple.status[index] &=
+          (uint8_t)~instance::status_instance_registered;
+  }
 };
 
 // Container for accessing and iterating over an instance's values/holders
 struct values_and_holders {
 private:
-    instance *inst;
-    using type_vec = std::vector<detail::type_info *>;
-    const type_vec &tinfo;
+  instance *inst;
+  using type_vec = std::vector<detail::type_info *>;
+  const type_vec &tinfo;
 
 public:
-    values_and_holders(instance *inst) : inst{inst}, tinfo(all_type_info(Py_TYPE(inst))) {}
+  values_and_holders(instance *inst)
+      : inst{inst}, tinfo(all_type_info(Py_TYPE(inst))) {}
 
-    struct iterator {
-    private:
-        instance *inst = nullptr;
-        const type_vec *types = nullptr;
-        value_and_holder curr;
-        friend struct values_and_holders;
-        iterator(instance *inst, const type_vec *tinfo)
-            : inst{inst}, types{tinfo},
-            curr(inst /* instance */,
-                 types->empty() ? nullptr : (*types)[0] /* type info */,
-                 0, /* vpos: (non-simple types only): the first vptr comes first */
-                 0 /* index */)
-        {}
-        // Past-the-end iterator:
-        iterator(size_t end) : curr(end) {}
-    public:
-        bool operator==(const iterator &other) { return curr.index == other.curr.index; }
-        bool operator!=(const iterator &other) { return curr.index != other.curr.index; }
-        iterator &operator++() {
-            if (!inst->simple_layout)
-                curr.vh += 1 + (*types)[curr.index]->holder_size_in_ptrs;
-            ++curr.index;
-            curr.type = curr.index < types->size() ? (*types)[curr.index] : nullptr;
-            return *this;
-        }
-        value_and_holder &operator*() { return curr; }
-        value_and_holder *operator->() { return &curr; }
-    };
+  struct iterator {
+  private:
+    instance *inst = nullptr;
+    const type_vec *types = nullptr;
+    value_and_holder curr;
+    friend struct values_and_holders;
+    iterator(instance *inst, const type_vec *tinfo)
+        : inst{inst}, types{tinfo},
+          curr(
+              inst /* instance */,
+              types->empty() ? nullptr : (*types)[0] /* type info */,
+              0, /* vpos: (non-simple types only): the first vptr comes first */
+              0 /* index */) {}
+    // Past-the-end iterator:
+    iterator(size_t end) : curr(end) {}
 
-    iterator begin() { return iterator(inst, &tinfo); }
-    iterator end() { return iterator(tinfo.size()); }
-
-    iterator find(const type_info *find_type) {
-        auto it = begin(), endit = end();
-        while (it != endit && it->type != find_type) ++it;
-        return it;
+  public:
+    bool operator==(const iterator &other) {
+      return curr.index == other.curr.index;
     }
+    bool operator!=(const iterator &other) {
+      return curr.index != other.curr.index;
+    }
+    iterator &operator++() {
+      if (!inst->simple_layout)
+        curr.vh += 1 + (*types)[curr.index]->holder_size_in_ptrs;
+      ++curr.index;
+      curr.type = curr.index < types->size() ? (*types)[curr.index] : nullptr;
+      return *this;
+    }
+    value_and_holder &operator*() { return curr; }
+    value_and_holder *operator->() { return &curr; }
+  };
 
-    size_t size() { return tinfo.size(); }
+  iterator begin() { return iterator(inst, &tinfo); }
+  iterator end() { return iterator(tinfo.size()); }
+
+  iterator find(const type_info *find_type) {
+    auto it = begin(), endit = end();
+    while (it != endit && it->type != find_type)
+      ++it;
+    return it;
+  }
+
+  size_t size() { return tinfo.size(); }
 };
 
 /**
- * Extracts C++ value and holder pointer references from an instance (which may contain multiple
- * values/holders for python-side multiple inheritance) that match the given type.  Throws an error
- * if the given type (or ValueType, if omitted) is not a pybind11 base of the given instance.  If
- * `find_type` is omitted (or explicitly specified as nullptr) the first value/holder are returned,
- * regardless of type (and the resulting .type will be nullptr).
+ * Extracts C++ value and holder pointer references from an instance (which may
+ * contain multiple values/holders for python-side multiple inheritance) that
+ * match the given type.  Throws an error if the given type (or ValueType, if
+ * omitted) is not a pybind11 base of the given instance.  If `find_type` is
+ * omitted (or explicitly specified as nullptr) the first value/holder are
+ * returned, regardless of type (and the resulting .type will be nullptr).
  *
- * The returned object should be short-lived: in particular, it must not outlive the called-upon
- * instance.
+ * The returned object should be short-lived: in particular, it must not outlive
+ * the called-upon instance.
  */
-PYBIND11_NOINLINE inline value_and_holder instance::get_value_and_holder(const type_info *find_type /*= nullptr default in common.h*/, bool throw_if_missing /*= true in common.h*/) {
-    // Optimize common case:
-    if (!find_type || Py_TYPE(this) == find_type->type)
-        return value_and_holder(this, find_type, 0, 0);
+PYBIND11_NOINLINE inline value_and_holder instance::get_value_and_holder(
+    const type_info *find_type /*= nullptr default in common.h*/,
+    bool throw_if_missing /*= true in common.h*/) {
+  // Optimize common case:
+  if (!find_type || Py_TYPE(this) == find_type->type)
+    return value_and_holder(this, find_type, 0, 0);
 
-    detail::values_and_holders vhs(this);
-    auto it = vhs.find(find_type);
-    if (it != vhs.end())
-        return *it;
+  detail::values_and_holders vhs(this);
+  auto it = vhs.find(find_type);
+  if (it != vhs.end())
+    return *it;
 
-    if (!throw_if_missing)
-        return value_and_holder();
+  if (!throw_if_missing)
+    return value_and_holder();
 
 #if defined(NDEBUG)
-    pybind11_fail("pybind11::detail::instance::get_value_and_holder: "
-            "type is not a pybind11 base of the given instance "
-            "(compile in debug mode for type details)");
+  pybind11_fail("pybind11::detail::instance::get_value_and_holder: "
+                "type is not a pybind11 base of the given instance "
+                "(compile in debug mode for type details)");
 #else
-    pybind11_fail("pybind11::detail::instance::get_value_and_holder: `" +
-            std::string(find_type->type->tp_name) + "' is not a pybind11 base of the given `" +
-            std::string(Py_TYPE(this)->tp_name) + "' instance");
+  pybind11_fail("pybind11::detail::instance::get_value_and_holder: `" +
+                std::string(find_type->type->tp_name) +
+                "' is not a pybind11 base of the given `" +
+                std::string(Py_TYPE(this)->tp_name) + "' instance");
 #endif
 }
 
 PYBIND11_NOINLINE inline void instance::allocate_layout() {
-    auto &tinfo = all_type_info(Py_TYPE(this));
+  auto &tinfo = all_type_info(Py_TYPE(this));
 
-    const size_t n_types = tinfo.size();
+  const size_t n_types = tinfo.size();
 
-    if (n_types == 0)
-        pybind11_fail("instance allocation failed: new instance has no pybind11-registered base types");
+  if (n_types == 0)
+    pybind11_fail("instance allocation failed: new instance has no "
+                  "pybind11-registered base types");
 
-    simple_layout =
-        n_types == 1 && tinfo.front()->holder_size_in_ptrs <= instance_simple_holder_in_ptrs();
+  simple_layout = n_types == 1 && tinfo.front()->holder_size_in_ptrs <=
+                                      instance_simple_holder_in_ptrs();
 
-    // Simple path: no python-side multiple inheritance, and a small-enough holder
-    if (simple_layout) {
-        simple_value_holder[0] = nullptr;
-        simple_holder_constructed = false;
-        simple_instance_registered = false;
+  // Simple path: no python-side multiple inheritance, and a small-enough holder
+  if (simple_layout) {
+    simple_value_holder[0] = nullptr;
+    simple_holder_constructed = false;
+    simple_instance_registered = false;
+  } else { // multiple base types or a too-large holder
+    // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a
+    // value pointer, [hN] is the (uninitialized) holder instance for value N,
+    // and [bb...] is a set of bool values that tracks whether each associated
+    // holder has been initialized.  Each [block] is padded, if necessary, to an
+    // integer multiple of sizeof(void *).
+    size_t space = 0;
+    for (auto t : tinfo) {
+      space += 1;                      // value pointer
+      space += t->holder_size_in_ptrs; // holder instance
     }
-    else { // multiple base types or a too-large holder
-        // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a value pointer,
-        // [hN] is the (uninitialized) holder instance for value N, and [bb...] is a set of bool
-        // values that tracks whether each associated holder has been initialized.  Each [block] is
-        // padded, if necessary, to an integer multiple of sizeof(void *).
-        size_t space = 0;
-        for (auto t : tinfo) {
-            space += 1; // value pointer
-            space += t->holder_size_in_ptrs; // holder instance
-        }
-        size_t flags_at = space;
-        space += size_in_ptrs(n_types); // status bytes (holder_constructed and instance_registered)
+    size_t flags_at = space;
+    space += size_in_ptrs(
+        n_types); // status bytes (holder_constructed and instance_registered)
 
-        // Allocate space for flags, values, and holders, and initialize it to 0 (flags and values,
-        // in particular, need to be 0).  Use Python's memory allocation functions: in Python 3.6
-        // they default to using pymalloc, which is designed to be efficient for small allocations
-        // like the one we're doing here; in earlier versions (and for larger allocations) they are
-        // just wrappers around malloc.
+    // Allocate space for flags, values, and holders, and initialize it to 0
+    // (flags and values, in particular, need to be 0).  Use Python's memory
+    // allocation functions: in Python 3.6 they default to using pymalloc, which
+    // is designed to be efficient for small allocations like the one we're
+    // doing here; in earlier versions (and for larger allocations) they are
+    // just wrappers around malloc.
 #if PY_VERSION_HEX >= 0x03050000
-        nonsimple.values_and_holders = (void **) PyMem_Calloc(space, sizeof(void *));
-        if (!nonsimple.values_and_holders) throw std::bad_alloc();
+    nonsimple.values_and_holders = (void **)PyMem_Calloc(space, sizeof(void *));
+    if (!nonsimple.values_and_holders)
+      throw std::bad_alloc();
 #else
-        nonsimple.values_and_holders = (void **) PyMem_New(void *, space);
-        if (!nonsimple.values_and_holders) throw std::bad_alloc();
-        std::memset(nonsimple.values_and_holders, 0, space * sizeof(void *));
+    nonsimple.values_and_holders = (void **)PyMem_New(void *, space);
+    if (!nonsimple.values_and_holders)
+      throw std::bad_alloc();
+    std::memset(nonsimple.values_and_holders, 0, space * sizeof(void *));
 #endif
-        nonsimple.status = reinterpret_cast<uint8_t *>(&nonsimple.values_and_holders[flags_at]);
-    }
-    owned = true;
+    nonsimple.status =
+        reinterpret_cast<uint8_t *>(&nonsimple.values_and_holders[flags_at]);
+  }
+  owned = true;
 }
 
 PYBIND11_NOINLINE inline void instance::deallocate_layout() {
-    if (!simple_layout)
-        PyMem_Free(nonsimple.values_and_holders);
+  if (!simple_layout)
+    PyMem_Free(nonsimple.values_and_holders);
 }
 
-PYBIND11_NOINLINE inline bool isinstance_generic(handle obj, const std::type_info &tp) {
-    handle type = detail::get_type_handle(tp, false);
-    if (!type)
-        return false;
-    return isinstance(obj, type);
+PYBIND11_NOINLINE inline bool isinstance_generic(handle obj,
+                                                 const std::type_info &tp) {
+  handle type = detail::get_type_handle(tp, false);
+  if (!type)
+    return false;
+  return isinstance(obj, type);
 }
 
 PYBIND11_NOINLINE inline std::string error_string() {
-    if (!PyErr_Occurred()) {
-        PyErr_SetString(PyExc_RuntimeError, "Unknown internal error occurred");
-        return "Unknown internal error occurred";
-    }
+  if (!PyErr_Occurred()) {
+    PyErr_SetString(PyExc_RuntimeError, "Unknown internal error occurred");
+    return "Unknown internal error occurred";
+  }
 
-    error_scope scope; // Preserve error state
+  error_scope scope; // Preserve error state
 
-    std::string errorString;
-    if (scope.type) {
-        errorString += handle(scope.type).attr("__name__").cast<std::string>();
-        errorString += ": ";
-    }
-    if (scope.value)
-        errorString += (std::string) str(scope.value);
+  std::string errorString;
+  if (scope.type) {
+    errorString += handle(scope.type).attr("__name__").cast<std::string>();
+    errorString += ": ";
+  }
+  if (scope.value)
+    errorString += (std::string)str(scope.value);
 
-    PyErr_NormalizeException(&scope.type, &scope.value, &scope.trace);
+  PyErr_NormalizeException(&scope.type, &scope.value, &scope.trace);
 
 #if PY_MAJOR_VERSION >= 3
-    if (scope.trace != nullptr)
-        PyException_SetTraceback(scope.value, scope.trace);
+  if (scope.trace != nullptr)
+    PyException_SetTraceback(scope.value, scope.trace);
 #endif
 
 #if !defined(PYPY_VERSION)
-    if (scope.trace) {
-        PyTracebackObject *trace = (PyTracebackObject *) scope.trace;
+  if (scope.trace) {
+    PyTracebackObject *trace = (PyTracebackObject *)scope.trace;
 
-        /* Get the deepest trace possible */
-        while (trace->tb_next)
-            trace = trace->tb_next;
+    /* Get the deepest trace possible */
+    while (trace->tb_next)
+      trace = trace->tb_next;
 
-        PyFrameObject *frame = trace->tb_frame;
-        errorString += "\n\nAt:\n";
-        while (frame) {
-            int lineno = PyFrame_GetLineNumber(frame);
-            errorString +=
-                "  " + handle(frame->f_code->co_filename).cast<std::string>() +
-                "(" + std::to_string(lineno) + "): " +
-                handle(frame->f_code->co_name).cast<std::string>() + "\n";
-            frame = frame->f_back;
-        }
+    PyFrameObject *frame = trace->tb_frame;
+    errorString += "\n\nAt:\n";
+    while (frame) {
+      int lineno = PyFrame_GetLineNumber(frame);
+      errorString +=
+          "  " + handle(frame->f_code->co_filename).cast<std::string>() + "(" +
+          std::to_string(lineno) +
+          "): " + handle(frame->f_code->co_name).cast<std::string>() + "\n";
+      frame = frame->f_back;
     }
+  }
 #endif
 
-    return errorString;
+  return errorString;
 }
 
-PYBIND11_NOINLINE inline handle get_object_handle(const void *ptr, const detail::type_info *type ) {
-    auto &instances = get_internals().registered_instances;
-    auto range = instances.equal_range(ptr);
-    for (auto it = range.first; it != range.second; ++it) {
-        for (auto vh : values_and_holders(it->second)) {
-            if (vh.type == type)
-                return handle((PyObject *) it->second);
-        }
+PYBIND11_NOINLINE inline handle
+get_object_handle(const void *ptr, const detail::type_info *type) {
+  auto &instances = get_internals().registered_instances;
+  auto range = instances.equal_range(ptr);
+  for (auto it = range.first; it != range.second; ++it) {
+    for (auto vh : values_and_holders(it->second)) {
+      if (vh.type == type)
+        return handle((PyObject *)it->second);
     }
-    return handle();
+  }
+  return handle();
 }
 
 inline PyThreadState *get_thread_state_unchecked() {
 #if defined(PYPY_VERSION)
-    return PyThreadState_GET();
+  return PyThreadState_GET();
 #elif PY_VERSION_HEX < 0x03000000
-    return _PyThreadState_Current;
+  return _PyThreadState_Current;
 #elif PY_VERSION_HEX < 0x03050000
-    return (PyThreadState*) _Py_atomic_load_relaxed(&_PyThreadState_Current);
+  return (PyThreadState *)_Py_atomic_load_relaxed(&_PyThreadState_Current);
 #elif PY_VERSION_HEX < 0x03050200
-    return (PyThreadState*) _PyThreadState_Current.value;
+  return (PyThreadState *)_PyThreadState_Current.value;
 #else
-    return _PyThreadState_UncheckedGet();
+  return _PyThreadState_UncheckedGet();
 #endif
 }
 
@@ -482,1100 +524,1283 @@ inline PyObject *make_new_instance(PyTypeObject *type);
 
 class type_caster_generic {
 public:
-    PYBIND11_NOINLINE type_caster_generic(const std::type_info &type_info)
-        : typeinfo(get_type_info(type_info)), cpptype(&type_info) { }
+  PYBIND11_NOINLINE type_caster_generic(const std::type_info &type_info)
+      : typeinfo(get_type_info(type_info)), cpptype(&type_info) {}
 
-    type_caster_generic(const type_info *typeinfo)
-        : typeinfo(typeinfo), cpptype(typeinfo ? typeinfo->cpptype : nullptr) { }
+  type_caster_generic(const type_info *typeinfo)
+      : typeinfo(typeinfo), cpptype(typeinfo ? typeinfo->cpptype : nullptr) {}
 
-    bool load(handle src, bool convert) {
-        return load_impl<type_caster_generic>(src, convert);
+  bool load(handle src, bool convert) {
+    return load_impl<type_caster_generic>(src, convert);
+  }
+
+  PYBIND11_NOINLINE static handle
+  cast(const void *_src, return_value_policy policy, handle parent,
+       const detail::type_info *tinfo, void *(*copy_constructor)(const void *),
+       void *(*move_constructor)(const void *),
+       const void *existing_holder = nullptr) {
+    if (!tinfo) // no type info: error will be set already
+      return handle();
+
+    void *src = const_cast<void *>(_src);
+    if (src == nullptr)
+      return none().release();
+
+    auto it_instances = get_internals().registered_instances.equal_range(src);
+    for (auto it_i = it_instances.first; it_i != it_instances.second; ++it_i) {
+      for (auto instance_type : detail::all_type_info(Py_TYPE(it_i->second))) {
+        if (instance_type &&
+            same_type(*instance_type->cpptype, *tinfo->cpptype))
+          return handle((PyObject *)it_i->second).inc_ref();
+      }
     }
 
-    PYBIND11_NOINLINE static handle cast(const void *_src, return_value_policy policy, handle parent,
-                                         const detail::type_info *tinfo,
-                                         void *(*copy_constructor)(const void *),
-                                         void *(*move_constructor)(const void *),
-                                         const void *existing_holder = nullptr) {
-        if (!tinfo) // no type info: error will be set already
-            return handle();
+    auto inst = reinterpret_steal<object>(make_new_instance(tinfo->type));
+    auto wrapper = reinterpret_cast<instance *>(inst.ptr());
+    wrapper->owned = false;
+    void *&valueptr = values_and_holders(wrapper).begin()->value_ptr();
 
-        void *src = const_cast<void *>(_src);
-        if (src == nullptr)
-            return none().release();
+    switch (policy) {
+    case return_value_policy::automatic:
+    case return_value_policy::take_ownership:
+      valueptr = src;
+      wrapper->owned = true;
+      break;
 
-        auto it_instances = get_internals().registered_instances.equal_range(src);
-        for (auto it_i = it_instances.first; it_i != it_instances.second; ++it_i) {
-            for (auto instance_type : detail::all_type_info(Py_TYPE(it_i->second))) {
-                if (instance_type && same_type(*instance_type->cpptype, *tinfo->cpptype))
-                    return handle((PyObject *) it_i->second).inc_ref();
-            }
-        }
+    case return_value_policy::automatic_reference:
+    case return_value_policy::reference:
+      valueptr = src;
+      wrapper->owned = false;
+      break;
 
-        auto inst = reinterpret_steal<object>(make_new_instance(tinfo->type));
-        auto wrapper = reinterpret_cast<instance *>(inst.ptr());
-        wrapper->owned = false;
-        void *&valueptr = values_and_holders(wrapper).begin()->value_ptr();
+    case return_value_policy::copy:
+      if (copy_constructor)
+        valueptr = copy_constructor(src);
+      else
+        throw cast_error("return_value_policy = copy, but the "
+                         "object is non-copyable!");
+      wrapper->owned = true;
+      break;
 
-        switch (policy) {
-            case return_value_policy::automatic:
-            case return_value_policy::take_ownership:
-                valueptr = src;
-                wrapper->owned = true;
-                break;
+    case return_value_policy::move:
+      if (move_constructor)
+        valueptr = move_constructor(src);
+      else if (copy_constructor)
+        valueptr = copy_constructor(src);
+      else
+        throw cast_error("return_value_policy = move, but the "
+                         "object is neither movable nor copyable!");
+      wrapper->owned = true;
+      break;
 
-            case return_value_policy::automatic_reference:
-            case return_value_policy::reference:
-                valueptr = src;
-                wrapper->owned = false;
-                break;
+    case return_value_policy::reference_internal:
+      valueptr = src;
+      wrapper->owned = false;
+      keep_alive_impl(inst, parent);
+      break;
 
-            case return_value_policy::copy:
-                if (copy_constructor)
-                    valueptr = copy_constructor(src);
-                else
-                    throw cast_error("return_value_policy = copy, but the "
-                                     "object is non-copyable!");
-                wrapper->owned = true;
-                break;
-
-            case return_value_policy::move:
-                if (move_constructor)
-                    valueptr = move_constructor(src);
-                else if (copy_constructor)
-                    valueptr = copy_constructor(src);
-                else
-                    throw cast_error("return_value_policy = move, but the "
-                                     "object is neither movable nor copyable!");
-                wrapper->owned = true;
-                break;
-
-            case return_value_policy::reference_internal:
-                valueptr = src;
-                wrapper->owned = false;
-                keep_alive_impl(inst, parent);
-                break;
-
-            default:
-                throw cast_error("unhandled return_value_policy: should not happen!");
-        }
-
-        tinfo->init_instance(wrapper, existing_holder);
-
-        return inst.release();
+    default:
+      throw cast_error("unhandled return_value_policy: should not happen!");
     }
 
-    // Base methods for generic caster; there are overridden in copyable_holder_caster
-    void load_value(value_and_holder &&v_h) {
-        auto *&vptr = v_h.value_ptr();
-        // Lazy allocation for unallocated values:
-        if (vptr == nullptr) {
-            auto *type = v_h.type ? v_h.type : typeinfo;
-            if (type->operator_new) {
-                vptr = type->operator_new(type->type_size);
-            } else {
-                #if defined(PYBIND11_CPP17)
-                    if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
-                        vptr = ::operator new(type->type_size,
-                                              (std::align_val_t) type->type_align);
-                    else
-                #endif
-                vptr = ::operator new(type->type_size);
-            }
-        }
-        value = vptr;
+    tinfo->init_instance(wrapper, existing_holder);
+
+    return inst.release();
+  }
+
+  // Base methods for generic caster; there are overridden in
+  // copyable_holder_caster
+  void load_value(value_and_holder &&v_h) {
+    auto *&vptr = v_h.value_ptr();
+    // Lazy allocation for unallocated values:
+    if (vptr == nullptr) {
+      auto *type = v_h.type ? v_h.type : typeinfo;
+      if (type->operator_new) {
+        vptr = type->operator_new(type->type_size);
+      } else {
+#if defined(PYBIND11_CPP17)
+        if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
+          vptr = ::operator new(type->type_size,
+                                (std::align_val_t)type->type_align);
+        else
+#endif
+          vptr = ::operator new(type->type_size);
+      }
     }
-    bool try_implicit_casts(handle src, bool convert) {
-        for (auto &cast : typeinfo->implicit_casts) {
-            type_caster_generic sub_caster(*cast.first);
-            if (sub_caster.load(src, convert)) {
-                value = cast.second(sub_caster.value);
-                return true;
-            }
-        }
+    value = vptr;
+  }
+  bool try_implicit_casts(handle src, bool convert) {
+    for (auto &cast : typeinfo->implicit_casts) {
+      type_caster_generic sub_caster(*cast.first);
+      if (sub_caster.load(src, convert)) {
+        value = cast.second(sub_caster.value);
+        return true;
+      }
+    }
+    return false;
+  }
+  bool try_direct_conversions(handle src) {
+    for (auto &converter : *typeinfo->direct_conversions) {
+      if (converter(src.ptr(), value))
+        return true;
+    }
+    return false;
+  }
+  void check_holder_compat() {}
+
+  PYBIND11_NOINLINE static void *local_load(PyObject *src,
+                                            const type_info *ti) {
+    auto caster = type_caster_generic(ti);
+    if (caster.load(src, false))
+      return caster.value;
+    return nullptr;
+  }
+
+  /// Try to load with foreign typeinfo, if available. Used when there is no
+  /// native typeinfo, or when the native one wasn't able to produce a value.
+  PYBIND11_NOINLINE bool try_load_foreign_module_local(handle src) {
+    constexpr auto *local_key = PYBIND11_MODULE_LOCAL_ID;
+    const auto pytype = src.get_type();
+    if (!hasattr(pytype, local_key))
+      return false;
+
+    type_info *foreign_typeinfo =
+        reinterpret_borrow<capsule>(getattr(pytype, local_key));
+    // Only consider this foreign loader if actually foreign and is a loader of
+    // the correct cpp type
+    if (foreign_typeinfo->module_local_load == &local_load ||
+        (cpptype && !same_type(*cpptype, *foreign_typeinfo->cpptype)))
+      return false;
+
+    if (auto result =
+            foreign_typeinfo->module_local_load(src.ptr(), foreign_typeinfo)) {
+      value = result;
+      return true;
+    }
+    return false;
+  }
+
+  // Implementation of `load`; this takes the type of `this` so that it can
+  // dispatch the relevant bits of code between here and copyable_holder_caster
+  // where the two classes need different logic (without having to resort to
+  // virtual inheritance).
+  template <typename ThisT>
+  PYBIND11_NOINLINE bool load_impl(handle src, bool convert) {
+    if (!src)
+      return false;
+    if (!typeinfo)
+      return try_load_foreign_module_local(src);
+    if (src.is_none()) {
+      // Defer accepting None to other overloads (if we aren't in convert mode):
+      if (!convert)
         return false;
-    }
-    bool try_direct_conversions(handle src) {
-        for (auto &converter : *typeinfo->direct_conversions) {
-            if (converter(src.ptr(), value))
-                return true;
-        }
-        return false;
-    }
-    void check_holder_compat() {}
-
-    PYBIND11_NOINLINE static void *local_load(PyObject *src, const type_info *ti) {
-        auto caster = type_caster_generic(ti);
-        if (caster.load(src, false))
-            return caster.value;
-        return nullptr;
+      value = nullptr;
+      return true;
     }
 
-    /// Try to load with foreign typeinfo, if available. Used when there is no
-    /// native typeinfo, or when the native one wasn't able to produce a value.
-    PYBIND11_NOINLINE bool try_load_foreign_module_local(handle src) {
-        constexpr auto *local_key = PYBIND11_MODULE_LOCAL_ID;
-        const auto pytype = src.get_type();
-        if (!hasattr(pytype, local_key))
-            return false;
+    auto &this_ = static_cast<ThisT &>(*this);
+    this_.check_holder_compat();
 
-        type_info *foreign_typeinfo = reinterpret_borrow<capsule>(getattr(pytype, local_key));
-        // Only consider this foreign loader if actually foreign and is a loader of the correct cpp type
-        if (foreign_typeinfo->module_local_load == &local_load
-            || (cpptype && !same_type(*cpptype, *foreign_typeinfo->cpptype)))
-            return false;
+    PyTypeObject *srctype = Py_TYPE(src.ptr());
 
-        if (auto result = foreign_typeinfo->module_local_load(src.ptr(), foreign_typeinfo)) {
-            value = result;
+    // Case 1: If src is an exact type match for the target type then we can
+    // reinterpret_cast the instance's value pointer to the target type:
+    if (srctype == typeinfo->type) {
+      this_.load_value(
+          reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
+      return true;
+    }
+    // Case 2: We have a derived class
+    else if (PyType_IsSubtype(srctype, typeinfo->type)) {
+      auto &bases = all_type_info(srctype);
+      bool no_cpp_mi = typeinfo->simple_type;
+
+      // Case 2a: the python type is a Python-inherited derived class that
+      // inherits from just one simple (no MI) pybind11 class, or is an exact
+      // match, so the C++ instance is of the right type and we can use
+      // reinterpret_cast. (This is essentially the same as case 2b, but because
+      // not using multiple inheritance is extremely common, we handle it
+      // specially to avoid the loop iterator and type pointer lookup overhead)
+      if (bases.size() == 1 &&
+          (no_cpp_mi || bases.front()->type == typeinfo->type)) {
+        this_.load_value(
+            reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
+        return true;
+      }
+      // Case 2b: the python type inherits from multiple C++ bases.  Check the
+      // bases to see if we can find an exact match (or, for a simple C++ type,
+      // an inherited match); if so, we can safely reinterpret_cast to the
+      // relevant pointer.
+      else if (bases.size() > 1) {
+        for (auto base : bases) {
+          if (no_cpp_mi ? PyType_IsSubtype(base->type, typeinfo->type)
+                        : base->type == typeinfo->type) {
+            this_.load_value(
+                reinterpret_cast<instance *>(src.ptr())->get_value_and_holder(
+                    base));
             return true;
+          }
         }
-        return false;
+      }
+
+      // Case 2c: C++ multiple inheritance is involved and we couldn't find an
+      // exact type match in the registered bases, above, so try implicit
+      // casting (needed for proper C++ casting when MI is involved).
+      if (this_.try_implicit_casts(src, convert))
+        return true;
     }
 
-    // Implementation of `load`; this takes the type of `this` so that it can dispatch the relevant
-    // bits of code between here and copyable_holder_caster where the two classes need different
-    // logic (without having to resort to virtual inheritance).
-    template <typename ThisT>
-    PYBIND11_NOINLINE bool load_impl(handle src, bool convert) {
-        if (!src) return false;
-        if (!typeinfo) return try_load_foreign_module_local(src);
-        if (src.is_none()) {
-            // Defer accepting None to other overloads (if we aren't in convert mode):
-            if (!convert) return false;
-            value = nullptr;
-            return true;
+    // Perform an implicit conversion
+    if (convert) {
+      for (auto &converter : typeinfo->implicit_conversions) {
+        auto temp =
+            reinterpret_steal<object>(converter(src.ptr(), typeinfo->type));
+        if (load_impl<ThisT>(temp, false)) {
+          loader_life_support::add_patient(temp);
+          return true;
         }
-
-        auto &this_ = static_cast<ThisT &>(*this);
-        this_.check_holder_compat();
-
-        PyTypeObject *srctype = Py_TYPE(src.ptr());
-
-        // Case 1: If src is an exact type match for the target type then we can reinterpret_cast
-        // the instance's value pointer to the target type:
-        if (srctype == typeinfo->type) {
-            this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
-            return true;
-        }
-        // Case 2: We have a derived class
-        else if (PyType_IsSubtype(srctype, typeinfo->type)) {
-            auto &bases = all_type_info(srctype);
-            bool no_cpp_mi = typeinfo->simple_type;
-
-            // Case 2a: the python type is a Python-inherited derived class that inherits from just
-            // one simple (no MI) pybind11 class, or is an exact match, so the C++ instance is of
-            // the right type and we can use reinterpret_cast.
-            // (This is essentially the same as case 2b, but because not using multiple inheritance
-            // is extremely common, we handle it specially to avoid the loop iterator and type
-            // pointer lookup overhead)
-            if (bases.size() == 1 && (no_cpp_mi || bases.front()->type == typeinfo->type)) {
-                this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
-                return true;
-            }
-            // Case 2b: the python type inherits from multiple C++ bases.  Check the bases to see if
-            // we can find an exact match (or, for a simple C++ type, an inherited match); if so, we
-            // can safely reinterpret_cast to the relevant pointer.
-            else if (bases.size() > 1) {
-                for (auto base : bases) {
-                    if (no_cpp_mi ? PyType_IsSubtype(base->type, typeinfo->type) : base->type == typeinfo->type) {
-                        this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder(base));
-                        return true;
-                    }
-                }
-            }
-
-            // Case 2c: C++ multiple inheritance is involved and we couldn't find an exact type match
-            // in the registered bases, above, so try implicit casting (needed for proper C++ casting
-            // when MI is involved).
-            if (this_.try_implicit_casts(src, convert))
-                return true;
-        }
-
-        // Perform an implicit conversion
-        if (convert) {
-            for (auto &converter : typeinfo->implicit_conversions) {
-                auto temp = reinterpret_steal<object>(converter(src.ptr(), typeinfo->type));
-                if (load_impl<ThisT>(temp, false)) {
-                    loader_life_support::add_patient(temp);
-                    return true;
-                }
-            }
-            if (this_.try_direct_conversions(src))
-                return true;
-        }
-
-        // Failed to match local typeinfo. Try again with global.
-        if (typeinfo->module_local) {
-            if (auto gtype = get_global_type_info(*typeinfo->cpptype)) {
-                typeinfo = gtype;
-                return load(src, false);
-            }
-        }
-
-        // Global typeinfo has precedence over foreign module_local
-        return try_load_foreign_module_local(src);
+      }
+      if (this_.try_direct_conversions(src))
+        return true;
     }
 
-
-    // Called to do type lookup and wrap the pointer and type in a pair when a dynamic_cast
-    // isn't needed or can't be used.  If the type is unknown, sets the error and returns a pair
-    // with .second = nullptr.  (p.first = nullptr is not an error: it becomes None).
-    PYBIND11_NOINLINE static std::pair<const void *, const type_info *> src_and_type(
-            const void *src, const std::type_info &cast_type, const std::type_info *rtti_type = nullptr) {
-        if (auto *tpi = get_type_info(cast_type))
-            return {src, const_cast<const type_info *>(tpi)};
-
-        // Not found, set error:
-        std::string tname = rtti_type ? rtti_type->name() : cast_type.name();
-        detail::clean_type_id(tname);
-        std::string msg = "Unregistered type : " + tname;
-        PyErr_SetString(PyExc_TypeError, msg.c_str());
-        return {nullptr, nullptr};
+    // Failed to match local typeinfo. Try again with global.
+    if (typeinfo->module_local) {
+      if (auto gtype = get_global_type_info(*typeinfo->cpptype)) {
+        typeinfo = gtype;
+        return load(src, false);
+      }
     }
 
-    const type_info *typeinfo = nullptr;
-    const std::type_info *cpptype = nullptr;
-    void *value = nullptr;
+    // Global typeinfo has precedence over foreign module_local
+    return try_load_foreign_module_local(src);
+  }
+
+  // Called to do type lookup and wrap the pointer and type in a pair when a
+  // dynamic_cast isn't needed or can't be used.  If the type is unknown, sets
+  // the error and returns a pair with .second = nullptr.  (p.first = nullptr is
+  // not an error: it becomes None).
+  PYBIND11_NOINLINE static std::pair<const void *, const type_info *>
+  src_and_type(const void *src, const std::type_info &cast_type,
+               const std::type_info *rtti_type = nullptr) {
+    if (auto *tpi = get_type_info(cast_type))
+      return {src, const_cast<const type_info *>(tpi)};
+
+    // Not found, set error:
+    std::string tname = rtti_type ? rtti_type->name() : cast_type.name();
+    detail::clean_type_id(tname);
+    std::string msg = "Unregistered type : " + tname;
+    PyErr_SetString(PyExc_TypeError, msg.c_str());
+    return {nullptr, nullptr};
+  }
+
+  const type_info *typeinfo = nullptr;
+  const std::type_info *cpptype = nullptr;
+  void *value = nullptr;
 };
 
 /**
- * Determine suitable casting operator for pointer-or-lvalue-casting type casters.  The type caster
- * needs to provide `operator T*()` and `operator T&()` operators.
+ * Determine suitable casting operator for pointer-or-lvalue-casting type
+ * casters.  The type caster needs to provide `operator T*()` and `operator
+ * T&()` operators.
  *
- * If the type supports moving the value away via an `operator T&&() &&` method, it should use
- * `movable_cast_op_type` instead.
+ * If the type supports moving the value away via an `operator T&&() &&` method,
+ * it should use `movable_cast_op_type` instead.
  */
 template <typename T>
 using cast_op_type =
     conditional_t<std::is_pointer<remove_reference_t<T>>::value,
-        typename std::add_pointer<intrinsic_t<T>>::type,
-        typename std::add_lvalue_reference<intrinsic_t<T>>::type>;
+                  typename std::add_pointer<intrinsic_t<T>>::type,
+                  typename std::add_lvalue_reference<intrinsic_t<T>>::type>;
 
 /**
- * Determine suitable casting operator for a type caster with a movable value.  Such a type caster
- * needs to provide `operator T*()`, `operator T&()`, and `operator T&&() &&`.  The latter will be
- * called in appropriate contexts where the value can be moved rather than copied.
+ * Determine suitable casting operator for a type caster with a movable value.
+ * Such a type caster needs to provide `operator T*()`, `operator T&()`, and
+ * `operator T&&() &&`.  The latter will be called in appropriate contexts where
+ * the value can be moved rather than copied.
  *
- * These operator are automatically provided when using the PYBIND11_TYPE_CASTER macro.
+ * These operators are automatically provided when using the
+ * PYBIND11_TYPE_CASTER macro.
  */
 template <typename T>
-using movable_cast_op_type =
-    conditional_t<std::is_pointer<typename std::remove_reference<T>::type>::value,
-        typename std::add_pointer<intrinsic_t<T>>::type,
+using movable_cast_op_type = conditional_t<
+    std::is_pointer<typename std::remove_reference<T>::type>::value,
+    typename std::add_pointer<intrinsic_t<T>>::type,
     conditional_t<std::is_rvalue_reference<T>::value,
-        typename std::add_rvalue_reference<intrinsic_t<T>>::type,
-        typename std::add_lvalue_reference<intrinsic_t<T>>::type>>;
+                  typename std::add_rvalue_reference<intrinsic_t<T>>::type,
+                  typename std::add_lvalue_reference<intrinsic_t<T>>::type>>;
 
-// std::is_copy_constructible isn't quite enough: it lets std::vector<T> (and similar) through when
-// T is non-copyable, but code containing such a copy constructor fails to actually compile.
-template <typename T, typename SFINAE = void> struct is_copy_constructible : std::is_copy_constructible<T> {};
+// std::is_copy_constructible isn't quite enough: it lets std::vector<T> (and
+// similar) through when T is non-copyable, but code containing such a copy
+// constructor fails to actually compile.
+template <typename T, typename SFINAE = void>
+struct is_copy_constructible : std::is_copy_constructible<T> {};
 
-// Specialization for types that appear to be copy constructible but also look like stl containers
-// (we specifically check for: has `value_type` and `reference` with `reference = value_type&`): if
-// so, copy constructability depends on whether the value_type is copy constructible.
-template <typename Container> struct is_copy_constructible<Container, enable_if_t<all_of<
-        std::is_copy_constructible<Container>,
-        std::is_same<typename Container::value_type &, typename Container::reference>
-    >::value>> : is_copy_constructible<typename Container::value_type> {};
+// Specialization for types that appear to be copy constructible but also look
+// like stl containers (we specifically check for: has `value_type` and
+// `reference` with `reference = value_type&`): if so, copy constructibility
+// depends on whether the value_type is copy constructible.
+template <typename Container>
+struct is_copy_constructible<
+    Container,
+    enable_if_t<all_of<std::is_copy_constructible<Container>,
+                       std::is_same<typename Container::value_type &,
+                                    typename Container::reference>>::value>>
+    : is_copy_constructible<typename Container::value_type> {};
 
 #if !defined(PYBIND11_CPP17)
-// Likewise for std::pair before C++17 (which mandates that the copy constructor not exist when the
-// two types aren't themselves copy constructible).
-template <typename T1, typename T2> struct is_copy_constructible<std::pair<T1, T2>>
+// Likewise for std::pair before C++17 (which mandates that the copy constructor
+// not exist when the two types aren't themselves copy constructible).
+template <typename T1, typename T2>
+struct is_copy_constructible<std::pair<T1, T2>>
     : all_of<is_copy_constructible<T1>, is_copy_constructible<T2>> {};
 #endif
 
 NAMESPACE_END(detail)
 
-// polymorphic_type_hook<itype>::get(src, tinfo) determines whether the object pointed
-// to by `src` actually is an instance of some class derived from `itype`.
-// If so, it sets `tinfo` to point to the std::type_info representing that derived
-// type, and returns a pointer to the start of the most-derived object of that type
-// (in which `src` is a subobject; this will be the same address as `src` in most
-// single inheritance cases). If not, or if `src` is nullptr, it simply returns `src`
-// and leaves `tinfo` at its default value of nullptr.
+// polymorphic_type_hook<itype>::get(src, tinfo) determines whether the object
+// pointed to by `src` actually is an instance of some class derived from
+// `itype`. If so, it sets `tinfo` to point to the std::type_info representing
+// that derived type, and returns a pointer to the start of the most-derived
+// object of that type (in which `src` is a subobject; this will be the same
+// address as `src` in most single inheritance cases). If not, or if `src` is
+// nullptr, it simply returns `src` and leaves `tinfo` at its default value of
+// nullptr.
 //
-// The default polymorphic_type_hook just returns src. A specialization for polymorphic
-// types determines the runtime type of the passed object and adjusts the this-pointer
-// appropriately via dynamic_cast<void*>. This is what enables a C++ Animal* to appear
-// to Python as a Dog (if Dog inherits from Animal, Animal is polymorphic, Dog is
-// registered with pybind11, and this Animal is in fact a Dog).
+// The default polymorphic_type_hook just returns src. A specialization for
+// polymorphic types determines the runtime type of the passed object and
+// adjusts the this-pointer appropriately via dynamic_cast<void*>. This is what
+// enables a C++ Animal* to appear to Python as a Dog (if Dog inherits from
+// Animal, Animal is polymorphic, Dog is registered with pybind11, and this
+// Animal is in fact a Dog).
 //
-// You may specialize polymorphic_type_hook yourself for types that want to appear
-// polymorphic to Python but do not use C++ RTTI. (This is a not uncommon pattern
-// in performance-sensitive applications, used most notably in LLVM.)
-template <typename itype, typename SFINAE = void>
-struct polymorphic_type_hook
-{
-    static const void *get(const itype *src, const std::type_info*&) { return src; }
+// You may specialize polymorphic_type_hook yourself for types that want to
+// appear polymorphic to Python but do not use C++ RTTI. (This is a not uncommon
+// pattern in performance-sensitive applications, used most notably in LLVM.)
+template <typename itype, typename SFINAE = void> struct polymorphic_type_hook {
+  static const void *get(const itype *src, const std::type_info *&) {
+    return src;
+  }
 };
 template <typename itype>
-struct polymorphic_type_hook<itype, detail::enable_if_t<std::is_polymorphic<itype>::value>>
-{
-    static const void *get(const itype *src, const std::type_info*& type) {
-        type = src ? &typeid(*src) : nullptr;
-        return dynamic_cast<const void*>(src);
-    }
+struct polymorphic_type_hook<
+    itype, detail::enable_if_t<std::is_polymorphic<itype>::value>> {
+  static const void *get(const itype *src, const std::type_info *&type) {
+    type = src ? &typeid(*src) : nullptr;
+    return dynamic_cast<const void *>(src);
+  }
 };
 
 NAMESPACE_BEGIN(detail)
 
 /// Generic type caster for objects stored on the heap
 template <typename type> class type_caster_base : public type_caster_generic {
-    using itype = intrinsic_t<type>;
+  using itype = intrinsic_t<type>;
 
 public:
-    static constexpr auto name = _<type>();
+  static constexpr auto name = _<type>();
 
-    type_caster_base() : type_caster_base(typeid(type)) { }
-    explicit type_caster_base(const std::type_info &info) : type_caster_generic(info) { }
+  type_caster_base() : type_caster_base(typeid(type)) {}
+  explicit type_caster_base(const std::type_info &info)
+      : type_caster_generic(info) {}
 
-    static handle cast(const itype &src, return_value_policy policy, handle parent) {
-        if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
-            policy = return_value_policy::copy;
-        return cast(&src, policy, parent);
+  static handle cast(const itype &src, return_value_policy policy,
+                     handle parent) {
+    if (policy == return_value_policy::automatic ||
+        policy == return_value_policy::automatic_reference)
+      policy = return_value_policy::copy;
+    return cast(&src, policy, parent);
+  }
+
+  static handle cast(itype &&src, return_value_policy, handle parent) {
+    return cast(&src, return_value_policy::move, parent);
+  }
+
+  // Returns a (pointer, type_info) pair taking care of necessary type lookup
+  // for a polymorphic type (using RTTI by default, but can be overridden by
+  // specializing polymorphic_type_hook). If the instance isn't derived, returns
+  // the base version.
+  static std::pair<const void *, const type_info *>
+  src_and_type(const itype *src) {
+    auto &cast_type = typeid(itype);
+    const std::type_info *instance_type = nullptr;
+    const void *vsrc = polymorphic_type_hook<itype>::get(src, instance_type);
+    if (instance_type && !same_type(cast_type, *instance_type)) {
+      // This is a base pointer to a derived type. If the derived type is
+      // registered with pybind11, we want to make the full derived object
+      // available. In the typical case where itype is polymorphic, we get the
+      // correct derived pointer (which may be != base pointer) by a
+      // dynamic_cast to most derived type. If itype is not polymorphic, we
+      // won't get here except via a user-provided specialization of
+      // polymorphic_type_hook, and the user has promised that no this-pointer
+      // adjustment is required in that case, so it's OK to use static_cast.
+      if (const auto *tpi = get_type_info(*instance_type))
+        return {vsrc, tpi};
     }
+    // Otherwise we have either a nullptr, an `itype` pointer, or an unknown
+    // derived pointer, so don't do a cast
+    return type_caster_generic::src_and_type(src, cast_type, instance_type);
+  }
 
-    static handle cast(itype &&src, return_value_policy, handle parent) {
-        return cast(&src, return_value_policy::move, parent);
-    }
+  static handle cast(const itype *src, return_value_policy policy,
+                     handle parent) {
+    auto st = src_and_type(src);
+    return type_caster_generic::cast(st.first, policy, parent, st.second,
+                                     make_copy_constructor(src),
+                                     make_move_constructor(src));
+  }
 
-    // Returns a (pointer, type_info) pair taking care of necessary type lookup for a
-    // polymorphic type (using RTTI by default, but can be overridden by specializing
-    // polymorphic_type_hook). If the instance isn't derived, returns the base version.
-    static std::pair<const void *, const type_info *> src_and_type(const itype *src) {
-        auto &cast_type = typeid(itype);
-        const std::type_info *instance_type = nullptr;
-        const void *vsrc = polymorphic_type_hook<itype>::get(src, instance_type);
-        if (instance_type && !same_type(cast_type, *instance_type)) {
-            // This is a base pointer to a derived type. If the derived type is registered
-            // with pybind11, we want to make the full derived object available.
-            // In the typical case where itype is polymorphic, we get the correct
-            // derived pointer (which may be != base pointer) by a dynamic_cast to
-            // most derived type. If itype is not polymorphic, we won't get here
-            // except via a user-provided specialization of polymorphic_type_hook,
-            // and the user has promised that no this-pointer adjustment is
-            // required in that case, so it's OK to use static_cast.
-            if (const auto *tpi = get_type_info(*instance_type))
-                return {vsrc, tpi};
-        }
-        // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer, so
-        // don't do a cast
-        return type_caster_generic::src_and_type(src, cast_type, instance_type);
-    }
+  static handle cast_holder(const itype *src, const void *holder) {
+    auto st = src_and_type(src);
+    return type_caster_generic::cast(st.first,
+                                     return_value_policy::take_ownership, {},
+                                     st.second, nullptr, nullptr, holder);
+  }
 
-    static handle cast(const itype *src, return_value_policy policy, handle parent) {
-        auto st = src_and_type(src);
-        return type_caster_generic::cast(
-            st.first, policy, parent, st.second,
-            make_copy_constructor(src), make_move_constructor(src));
-    }
+  template <typename T> using cast_op_type = detail::cast_op_type<T>;
 
-    static handle cast_holder(const itype *src, const void *holder) {
-        auto st = src_and_type(src);
-        return type_caster_generic::cast(
-            st.first, return_value_policy::take_ownership, {}, st.second,
-            nullptr, nullptr, holder);
-    }
-
-    template <typename T> using cast_op_type = detail::cast_op_type<T>;
-
-    operator itype*() { return (type *) value; }
-    operator itype&() { if (!value) throw reference_cast_error(); return *((itype *) value); }
+  operator itype *() { return (type *)value; }
+  operator itype &() {
+    if (!value)
+      throw reference_cast_error();
+    return *((itype *)value);
+  }
 
 protected:
-    using Constructor = void *(*)(const void *);
+  using Constructor = void *(*)(const void *);
 
-    /* Only enabled when the types are {copy,move}-constructible *and* when the type
-       does not have a private operator new implementation. */
-    template <typename T, typename = enable_if_t<is_copy_constructible<T>::value>>
-    static auto make_copy_constructor(const T *x) -> decltype(new T(*x), Constructor{}) {
-        return [](const void *arg) -> void * {
-            return new T(*reinterpret_cast<const T *>(arg));
-        };
-    }
+  /* Only enabled when the types are {copy,move}-constructible *and* when the
+     type does not have a private operator new implementation. */
+  template <typename T, typename = enable_if_t<is_copy_constructible<T>::value>>
+  static auto make_copy_constructor(const T *x)
+      -> decltype(new T(*x), Constructor{}) {
+    return [](const void *arg) -> void * {
+      return new T(*reinterpret_cast<const T *>(arg));
+    };
+  }
 
-    template <typename T, typename = enable_if_t<std::is_move_constructible<T>::value>>
-    static auto make_move_constructor(const T *x) -> decltype(new T(std::move(*const_cast<T *>(x))), Constructor{}) {
-        return [](const void *arg) -> void * {
-            return new T(std::move(*const_cast<T *>(reinterpret_cast<const T *>(arg))));
-        };
-    }
+  template <typename T,
+            typename = enable_if_t<std::is_move_constructible<T>::value>>
+  static auto make_move_constructor(const T *x)
+      -> decltype(new T(std::move(*const_cast<T *>(x))), Constructor{}) {
+    return [](const void *arg) -> void * {
+      return new T(
+          std::move(*const_cast<T *>(reinterpret_cast<const T *>(arg))));
+    };
+  }
 
-    static Constructor make_copy_constructor(...) { return nullptr; }
-    static Constructor make_move_constructor(...) { return nullptr; }
+  static Constructor make_copy_constructor(...) { return nullptr; }
+  static Constructor make_move_constructor(...) { return nullptr; }
 };
 
-template <typename type, typename SFINAE = void> class type_caster : public type_caster_base<type> { };
+template <typename type, typename SFINAE = void>
+class type_caster : public type_caster_base<type> {};
 template <typename type> using make_caster = type_caster<intrinsic_t<type>>;
 
-// Shortcut for calling a caster's `cast_op_type` cast operator for casting a type_caster to a T
-template <typename T> typename make_caster<T>::template cast_op_type<T> cast_op(make_caster<T> &caster) {
-    return caster.operator typename make_caster<T>::template cast_op_type<T>();
+// Shortcut for calling a caster's `cast_op_type` cast operator for casting a
+// type_caster to a T
+template <typename T>
+typename make_caster<T>::template cast_op_type<T>
+cast_op(make_caster<T> &caster) {
+  return caster.operator typename make_caster<T>::template cast_op_type<T>();
 }
-template <typename T> typename make_caster<T>::template cast_op_type<typename std::add_rvalue_reference<T>::type>
+template <typename T>
+typename make_caster<T>::template cast_op_type<
+    typename std::add_rvalue_reference<T>::type>
 cast_op(make_caster<T> &&caster) {
-    return std::move(caster).operator
-        typename make_caster<T>::template cast_op_type<typename std::add_rvalue_reference<T>::type>();
+  return std::move(caster).operator typename make_caster<T>::
+      template cast_op_type<typename std::add_rvalue_reference<T>::type>();
 }
 
 template <typename type> class type_caster<std::reference_wrapper<type>> {
 private:
-    using caster_t = make_caster<type>;
-    caster_t subcaster;
-    using subcaster_cast_op_type = typename caster_t::template cast_op_type<type>;
-    static_assert(std::is_same<typename std::remove_const<type>::type &, subcaster_cast_op_type>::value,
-            "std::reference_wrapper<T> caster requires T to have a caster with an `T &` operator");
+  using caster_t = make_caster<type>;
+  caster_t subcaster;
+  using subcaster_cast_op_type = typename caster_t::template cast_op_type<type>;
+  static_assert(std::is_same<typename std::remove_const<type>::type &,
+                             subcaster_cast_op_type>::value,
+                "std::reference_wrapper<T> caster requires T to have a caster "
+                "with an `T &` operator");
+
 public:
-    bool load(handle src, bool convert) { return subcaster.load(src, convert); }
-    static constexpr auto name = caster_t::name;
-    static handle cast(const std::reference_wrapper<type> &src, return_value_policy policy, handle parent) {
-        // It is definitely wrong to take ownership of this pointer, so mask that rvp
-        if (policy == return_value_policy::take_ownership || policy == return_value_policy::automatic)
-            policy = return_value_policy::automatic_reference;
-        return caster_t::cast(&src.get(), policy, parent);
-    }
-    template <typename T> using cast_op_type = std::reference_wrapper<type>;
-    operator std::reference_wrapper<type>() { return subcaster.operator subcaster_cast_op_type&(); }
+  bool load(handle src, bool convert) { return subcaster.load(src, convert); }
+  static constexpr auto name = caster_t::name;
+  static handle cast(const std::reference_wrapper<type> &src,
+                     return_value_policy policy, handle parent) {
+    // It is definitely wrong to take ownership of this pointer, so mask that
+    // return value policy
+    if (policy == return_value_policy::take_ownership ||
+        policy == return_value_policy::automatic)
+      policy = return_value_policy::automatic_reference;
+    return caster_t::cast(&src.get(), policy, parent);
+  }
+  template <typename T> using cast_op_type = std::reference_wrapper<type>;
+  operator std::reference_wrapper<type>() {
+    return subcaster.operator subcaster_cast_op_type &();
+  }
 };
 
-#define PYBIND11_TYPE_CASTER(type, py_name) \
-    protected: \
-        type value; \
-    public: \
-        static constexpr auto name = py_name; \
-        template <typename T_, enable_if_t<std::is_same<type, remove_cv_t<T_>>::value, int> = 0> \
-        static handle cast(T_ *src, return_value_policy policy, handle parent) { \
-            if (!src) return none().release(); \
-            if (policy == return_value_policy::take_ownership) { \
-                auto h = cast(std::move(*src), policy, parent); delete src; return h; \
-            } else { \
-                return cast(*src, policy, parent); \
-            } \
-        } \
-        operator type*() { return &value; } \
-        operator type&() { return value; } \
-        operator type&&() && { return std::move(value); } \
-        template <typename T_> using cast_op_type = pybind11::detail::movable_cast_op_type<T_>
+#define PYBIND11_TYPE_CASTER(type, py_name)                                    \
+protected:                                                                     \
+  type value;                                                                  \
+                                                                               \
+public:                                                                        \
+  static constexpr auto name = py_name;                                        \
+  template <typename T_,                                                       \
+            enable_if_t<std::is_same<type, remove_cv_t<T_>>::value, int> = 0>  \
+  static handle cast(T_ *src, return_value_policy policy, handle parent) {     \
+    if (!src)                                                                  \
+      return none().release();                                                 \
+    if (policy == return_value_policy::take_ownership) {                       \
+      auto h = cast(std::move(*src), policy, parent);                          \
+      delete src;                                                              \
+      return h;                                                                \
+    } else {                                                                   \
+      return cast(*src, policy, parent);                                       \
+    }                                                                          \
+  }                                                                            \
+  operator type *() { return &value; }                                         \
+  operator type &() { return value; }                                          \
+  operator type &&() && { return std::move(value); }                           \
+  template <typename T_>                                                       \
+  using cast_op_type = pybind11::detail::movable_cast_op_type<T_>
 
-
-template <typename CharT> using is_std_char_type = any_of<
-    std::is_same<CharT, char>, /* std::string */
-    std::is_same<CharT, char16_t>, /* std::u16string */
-    std::is_same<CharT, char32_t>, /* std::u32string */
-    std::is_same<CharT, wchar_t> /* std::wstring */
->;
+template <typename CharT>
+using is_std_char_type =
+    any_of<std::is_same<CharT, char>,     /* std::string */
+           std::is_same<CharT, char16_t>, /* std::u16string */
+           std::is_same<CharT, char32_t>, /* std::u32string */
+           std::is_same<CharT, wchar_t>   /* std::wstring */
+           >;
 
 template <typename T>
-struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_type<T>::value>> {
-    using _py_type_0 = conditional_t<sizeof(T) <= sizeof(long), long, long long>;
-    using _py_type_1 = conditional_t<std::is_signed<T>::value, _py_type_0, typename std::make_unsigned<_py_type_0>::type>;
-    using py_type = conditional_t<std::is_floating_point<T>::value, double, _py_type_1>;
+struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value &&
+                                  !is_std_char_type<T>::value>> {
+  using _py_type_0 = conditional_t<sizeof(T) <= sizeof(long), long, long long>;
+  using _py_type_1 =
+      conditional_t<std::is_signed<T>::value, _py_type_0,
+                    typename std::make_unsigned<_py_type_0>::type>;
+  using py_type =
+      conditional_t<std::is_floating_point<T>::value, double, _py_type_1>;
+
 public:
+  bool load(handle src, bool convert) {
+    py_type py_value;
 
-    bool load(handle src, bool convert) {
-        py_type py_value;
+    if (!src)
+      return false;
 
-        if (!src)
-            return false;
+    if (std::is_floating_point<T>::value) {
+      if (convert || PyFloat_Check(src.ptr()))
+        py_value = (py_type)PyFloat_AsDouble(src.ptr());
+      else
+        return false;
+    } else if (PyFloat_Check(src.ptr())) {
+      return false;
+    } else if (std::is_unsigned<py_type>::value) {
+      py_value = as_unsigned<py_type>(src.ptr());
+    } else { // signed integer:
+      py_value = sizeof(T) <= sizeof(long)
+                     ? (py_type)PyLong_AsLong(src.ptr())
+                     : (py_type)PYBIND11_LONG_AS_LONGLONG(src.ptr());
+    }
 
-        if (std::is_floating_point<T>::value) {
-            if (convert || PyFloat_Check(src.ptr()))
-                py_value = (py_type) PyFloat_AsDouble(src.ptr());
-            else
-                return false;
-        } else if (PyFloat_Check(src.ptr())) {
-            return false;
-        } else if (std::is_unsigned<py_type>::value) {
-            py_value = as_unsigned<py_type>(src.ptr());
-        } else { // signed integer:
-            py_value = sizeof(T) <= sizeof(long)
-                ? (py_type) PyLong_AsLong(src.ptr())
-                : (py_type) PYBIND11_LONG_AS_LONGLONG(src.ptr());
-        }
-
-        bool py_err = py_value == (py_type) -1 && PyErr_Occurred();
-        if (py_err || (std::is_integral<T>::value && sizeof(py_type) != sizeof(T) &&
-                       (py_value < (py_type) std::numeric_limits<T>::min() ||
-                        py_value > (py_type) std::numeric_limits<T>::max()))) {
-            bool type_error = py_err && PyErr_ExceptionMatches(
+    bool py_err = py_value == (py_type)-1 && PyErr_Occurred();
+    if (py_err || (std::is_integral<T>::value && sizeof(py_type) != sizeof(T) &&
+                   (py_value < (py_type)std::numeric_limits<T>::min() ||
+                    py_value > (py_type)std::numeric_limits<T>::max()))) {
+      bool type_error = py_err && PyErr_ExceptionMatches(
 #if PY_VERSION_HEX < 0x03000000 && !defined(PYPY_VERSION)
-                PyExc_SystemError
+                                      PyExc_SystemError
 #else
-                PyExc_TypeError
+                                      PyExc_TypeError
 #endif
-            );
-            PyErr_Clear();
-            if (type_error && convert && PyNumber_Check(src.ptr())) {
-                auto tmp = reinterpret_steal<object>(std::is_floating_point<T>::value
-                                                     ? PyNumber_Float(src.ptr())
-                                                     : PyNumber_Long(src.ptr()));
-                PyErr_Clear();
-                return load(tmp, false);
-            }
-            return false;
-        }
-
-        value = (T) py_value;
-        return true;
+                                  );
+      PyErr_Clear();
+      if (type_error && convert && PyNumber_Check(src.ptr())) {
+        auto tmp = reinterpret_steal<object>(std::is_floating_point<T>::value
+                                                 ? PyNumber_Float(src.ptr())
+                                                 : PyNumber_Long(src.ptr()));
+        PyErr_Clear();
+        return load(tmp, false);
+      }
+      return false;
     }
 
-    template<typename U = T>
-    static typename std::enable_if<std::is_floating_point<U>::value, handle>::type
-    cast(U src, return_value_policy /* policy */, handle /* parent */) {
-        return PyFloat_FromDouble((double) src);
-    }
+    value = (T)py_value;
+    return true;
+  }
 
-    template<typename U = T>
-    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_signed<U>::value && (sizeof(U) <= sizeof(long)), handle>::type
-    cast(U src, return_value_policy /* policy */, handle /* parent */) {
-        return PYBIND11_LONG_FROM_SIGNED((long) src);
-    }
+  template <typename U = T>
+  static typename std::enable_if<std::is_floating_point<U>::value, handle>::type
+  cast(U src, return_value_policy /* policy */, handle /* parent */) {
+    return PyFloat_FromDouble((double)src);
+  }
 
-    template<typename U = T>
-    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_unsigned<U>::value && (sizeof(U) <= sizeof(unsigned long)), handle>::type
-    cast(U src, return_value_policy /* policy */, handle /* parent */) {
-        return PYBIND11_LONG_FROM_UNSIGNED((unsigned long) src);
-    }
+  template <typename U = T>
+  static typename std::enable_if<!std::is_floating_point<U>::value &&
+                                     std::is_signed<U>::value &&
+                                     (sizeof(U) <= sizeof(long)),
+                                 handle>::type
+  cast(U src, return_value_policy /* policy */, handle /* parent */) {
+    return PYBIND11_LONG_FROM_SIGNED((long)src);
+  }
 
-    template<typename U = T>
-    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_signed<U>::value && (sizeof(U) > sizeof(long)), handle>::type
-    cast(U src, return_value_policy /* policy */, handle /* parent */) {
-        return PyLong_FromLongLong((long long) src);
-    }
+  template <typename U = T>
+  static typename std::enable_if<!std::is_floating_point<U>::value &&
+                                     std::is_unsigned<U>::value &&
+                                     (sizeof(U) <= sizeof(unsigned long)),
+                                 handle>::type
+  cast(U src, return_value_policy /* policy */, handle /* parent */) {
+    return PYBIND11_LONG_FROM_UNSIGNED((unsigned long)src);
+  }
 
-    template<typename U = T>
-    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_unsigned<U>::value && (sizeof(U) > sizeof(unsigned long)), handle>::type
-    cast(U src, return_value_policy /* policy */, handle /* parent */) {
-        return PyLong_FromUnsignedLongLong((unsigned long long) src);
-    }
+  template <typename U = T>
+  static typename std::enable_if<!std::is_floating_point<U>::value &&
+                                     std::is_signed<U>::value &&
+                                     (sizeof(U) > sizeof(long)),
+                                 handle>::type
+  cast(U src, return_value_policy /* policy */, handle /* parent */) {
+    return PyLong_FromLongLong((long long)src);
+  }
 
-    PYBIND11_TYPE_CASTER(T, _<std::is_integral<T>::value>("int", "float"));
+  template <typename U = T>
+  static typename std::enable_if<!std::is_floating_point<U>::value &&
+                                     std::is_unsigned<U>::value &&
+                                     (sizeof(U) > sizeof(unsigned long)),
+                                 handle>::type
+  cast(U src, return_value_policy /* policy */, handle /* parent */) {
+    return PyLong_FromUnsignedLongLong((unsigned long long)src);
+  }
+
+  PYBIND11_TYPE_CASTER(T, _<std::is_integral<T>::value>("int", "float"));
 };
 
-template<typename T> struct void_caster {
+template <typename T> struct void_caster {
 public:
-    bool load(handle src, bool) {
-        if (src && src.is_none())
-            return true;
-        return false;
-    }
-    static handle cast(T, return_value_policy /* policy */, handle /* parent */) {
-        return none().inc_ref();
-    }
-    PYBIND11_TYPE_CASTER(T, _("None"));
+  bool load(handle src, bool) {
+    if (src && src.is_none())
+      return true;
+    return false;
+  }
+  static handle cast(T, return_value_policy /* policy */, handle /* parent */) {
+    return none().inc_ref();
+  }
+  PYBIND11_TYPE_CASTER(T, _("None"));
 };
 
 template <> class type_caster<void_type> : public void_caster<void_type> {};
 
 template <> class type_caster<void> : public type_caster<void_type> {
 public:
-    using type_caster<void_type>::cast;
+  using type_caster<void_type>::cast;
 
-    bool load(handle h, bool) {
-        if (!h) {
-            return false;
-        } else if (h.is_none()) {
-            value = nullptr;
-            return true;
-        }
-
-        /* Check if this is a capsule */
-        if (isinstance<capsule>(h)) {
-            value = reinterpret_borrow<capsule>(h);
-            return true;
-        }
-
-        /* Check if this is a C++ type */
-        auto &bases = all_type_info((PyTypeObject *) h.get_type().ptr());
-        if (bases.size() == 1) { // Only allowing loading from a single-value type
-            value = values_and_holders(reinterpret_cast<instance *>(h.ptr())).begin()->value_ptr();
-            return true;
-        }
-
-        /* Fail */
-        return false;
+  bool load(handle h, bool) {
+    if (!h) {
+      return false;
+    } else if (h.is_none()) {
+      value = nullptr;
+      return true;
     }
 
-    static handle cast(const void *ptr, return_value_policy /* policy */, handle /* parent */) {
-        if (ptr)
-            return capsule(ptr).release();
-        else
-            return none().inc_ref();
+    /* Check if this is a capsule */
+    if (isinstance<capsule>(h)) {
+      value = reinterpret_borrow<capsule>(h);
+      return true;
     }
 
-    template <typename T> using cast_op_type = void*&;
-    operator void *&() { return value; }
-    static constexpr auto name = _("capsule");
+    /* Check if this is a C++ type */
+    auto &bases = all_type_info((PyTypeObject *)h.get_type().ptr());
+    if (bases.size() == 1) { // Only allowing loading from a single-value type
+      value = values_and_holders(reinterpret_cast<instance *>(h.ptr()))
+                  .begin()
+                  ->value_ptr();
+      return true;
+    }
+
+    /* Fail */
+    return false;
+  }
+
+  static handle cast(const void *ptr, return_value_policy /* policy */,
+                     handle /* parent */) {
+    if (ptr)
+      return capsule(ptr).release();
+    else
+      return none().inc_ref();
+  }
+
+  template <typename T> using cast_op_type = void *&;
+  operator void * &() { return value; }
+  static constexpr auto name = _("capsule");
+
 private:
-    void *value = nullptr;
+  void *value = nullptr;
 };
 
-template <> class type_caster<std::nullptr_t> : public void_caster<std::nullptr_t> { };
+template <>
+class type_caster<std::nullptr_t> : public void_caster<std::nullptr_t> {};
 
 template <> class type_caster<bool> {
 public:
-    bool load(handle src, bool convert) {
-        if (!src) return false;
-        else if (src.ptr() == Py_True) { value = true; return true; }
-        else if (src.ptr() == Py_False) { value = false; return true; }
-        else if (convert || !strcmp("numpy.bool_", Py_TYPE(src.ptr())->tp_name)) {
-            // (allow non-implicit conversion for numpy booleans)
+  bool load(handle src, bool convert) {
+    if (!src)
+      return false;
+    else if (src.ptr() == Py_True) {
+      value = true;
+      return true;
+    } else if (src.ptr() == Py_False) {
+      value = false;
+      return true;
+    } else if (convert || !strcmp("numpy.bool_", Py_TYPE(src.ptr())->tp_name)) {
+      // (allow non-implicit conversion for numpy booleans)
 
-            Py_ssize_t res = -1;
-            if (src.is_none()) {
-                res = 0;  // None is implicitly converted to False
-            }
-            #if defined(PYPY_VERSION)
-            // On PyPy, check that "__bool__" (or "__nonzero__" on Python 2.7) attr exists
-            else if (hasattr(src, PYBIND11_BOOL_ATTR)) {
-                res = PyObject_IsTrue(src.ptr());
-            }
-            #else
-            // Alternate approach for CPython: this does the same as the above, but optimized
-            // using the CPython API so as to avoid an unneeded attribute lookup.
-            else if (auto tp_as_number = src.ptr()->ob_type->tp_as_number) {
-                if (PYBIND11_NB_BOOL(tp_as_number)) {
-                    res = (*PYBIND11_NB_BOOL(tp_as_number))(src.ptr());
-                }
-            }
-            #endif
-            if (res == 0 || res == 1) {
-                value = (bool) res;
-                return true;
-            }
+      Py_ssize_t res = -1;
+      if (src.is_none()) {
+        res = 0; // None is implicitly converted to False
+      }
+#if defined(PYPY_VERSION)
+      // On PyPy, check that "__bool__" (or "__nonzero__" on Python 2.7) attr
+      // exists
+      else if (hasattr(src, PYBIND11_BOOL_ATTR)) {
+        res = PyObject_IsTrue(src.ptr());
+      }
+#else
+      // Alternate approach for CPython: this does the same as the above, but
+      // optimized using the CPython API so as to avoid an unneeded attribute
+      // lookup.
+      else if (auto tp_as_number = src.ptr()->ob_type->tp_as_number) {
+        if (PYBIND11_NB_BOOL(tp_as_number)) {
+          res = (*PYBIND11_NB_BOOL(tp_as_number))(src.ptr());
         }
-        return false;
+      }
+#endif
+      if (res == 0 || res == 1) {
+        value = (bool)res;
+        return true;
+      }
     }
-    static handle cast(bool src, return_value_policy /* policy */, handle /* parent */) {
-        return handle(src ? Py_True : Py_False).inc_ref();
-    }
-    PYBIND11_TYPE_CASTER(bool, _("bool"));
+    return false;
+  }
+  static handle cast(bool src, return_value_policy /* policy */,
+                     handle /* parent */) {
+    return handle(src ? Py_True : Py_False).inc_ref();
+  }
+  PYBIND11_TYPE_CASTER(bool, _("bool"));
 };
 
 // Helper class for UTF-{8,16,32} C++ stl strings:
 template <typename StringType, bool IsView = false> struct string_caster {
-    using CharT = typename StringType::value_type;
+  using CharT = typename StringType::value_type;
 
-    // Simplify life by being able to assume standard char sizes (the standard only guarantees
-    // minimums, but Python requires exact sizes)
-    static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
-    static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
-    static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
-    // wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
-    static_assert(!std::is_same<CharT, wchar_t>::value || sizeof(CharT) == 2 || sizeof(CharT) == 4,
-            "Unsupported wchar_t size != 2/4");
-    static constexpr size_t UTF_N = 8 * sizeof(CharT);
+  // Simplify life by being able to assume standard char sizes (the standard
+  // only guarantees minimums, but Python requires exact sizes)
+  static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1,
+                "Unsupported char size != 1");
+  static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2,
+                "Unsupported char16_t size != 2");
+  static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4,
+                "Unsupported char32_t size != 4");
+  // wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
+  static_assert(!std::is_same<CharT, wchar_t>::value || sizeof(CharT) == 2 ||
+                    sizeof(CharT) == 4,
+                "Unsupported wchar_t size != 2/4");
+  static constexpr size_t UTF_N = 8 * sizeof(CharT);
 
-    bool load(handle src, bool) {
+  bool load(handle src, bool) {
 #if PY_MAJOR_VERSION < 3
-        object temp;
+    object temp;
 #endif
-        handle load_src = src;
-        if (!src) {
-            return false;
-        } else if (!PyUnicode_Check(load_src.ptr())) {
+    handle load_src = src;
+    if (!src) {
+      return false;
+    } else if (!PyUnicode_Check(load_src.ptr())) {
 #if PY_MAJOR_VERSION >= 3
-            return load_bytes(load_src);
+      return load_bytes(load_src);
 #else
-            if (sizeof(CharT) == 1) {
-                return load_bytes(load_src);
-            }
+      if (sizeof(CharT) == 1) {
+        return load_bytes(load_src);
+      }
 
-            // The below is a guaranteed failure in Python 3 when PyUnicode_Check returns false
-            if (!PYBIND11_BYTES_CHECK(load_src.ptr()))
-                return false;
+      // The below is a guaranteed failure in Python 3 when PyUnicode_Check
+      // returns false
+      if (!PYBIND11_BYTES_CHECK(load_src.ptr()))
+        return false;
 
-            temp = reinterpret_steal<object>(PyUnicode_FromObject(load_src.ptr()));
-            if (!temp) { PyErr_Clear(); return false; }
-            load_src = temp;
+      temp = reinterpret_steal<object>(PyUnicode_FromObject(load_src.ptr()));
+      if (!temp) {
+        PyErr_Clear();
+        return false;
+      }
+      load_src = temp;
 #endif
-        }
-
-        object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
-            load_src.ptr(), UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr));
-        if (!utfNbytes) { PyErr_Clear(); return false; }
-
-        const CharT *buffer = reinterpret_cast<const CharT *>(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr()));
-        size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);
-        if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32
-        value = StringType(buffer, length);
-
-        // If we're loading a string_view we need to keep the encoded Python object alive:
-        if (IsView)
-            loader_life_support::add_patient(utfNbytes);
-
-        return true;
     }
 
-    static handle cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) {
-        const char *buffer = reinterpret_cast<const char *>(src.data());
-        ssize_t nbytes = ssize_t(src.size() * sizeof(CharT));
-        handle s = decode_utfN(buffer, nbytes);
-        if (!s) throw error_already_set();
-        return s;
+    object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
+        load_src.ptr(),
+        UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr));
+    if (!utfNbytes) {
+      PyErr_Clear();
+      return false;
     }
 
-    PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME));
+    const CharT *buffer = reinterpret_cast<const CharT *>(
+        PYBIND11_BYTES_AS_STRING(utfNbytes.ptr()));
+    size_t length =
+        (size_t)PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);
+    if (UTF_N > 8) {
+      buffer++;
+      length--;
+    } // Skip BOM for UTF-16/32
+    value = StringType(buffer, length);
+
+    // If we're loading a string_view we need to keep the encoded Python object
+    // alive:
+    if (IsView)
+      loader_life_support::add_patient(utfNbytes);
+
+    return true;
+  }
+
+  static handle cast(const StringType &src, return_value_policy /* policy */,
+                     handle /* parent */) {
+    const char *buffer = reinterpret_cast<const char *>(src.data());
+    ssize_t nbytes = ssize_t(src.size() * sizeof(CharT));
+    handle s = decode_utfN(buffer, nbytes);
+    if (!s)
+      throw error_already_set();
+    return s;
+  }
+
+  PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME));
 
 private:
-    static handle decode_utfN(const char *buffer, ssize_t nbytes) {
+  static handle decode_utfN(const char *buffer, ssize_t nbytes) {
 #if !defined(PYPY_VERSION)
-        return
-            UTF_N == 8  ? PyUnicode_DecodeUTF8(buffer, nbytes, nullptr) :
-            UTF_N == 16 ? PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr) :
-                          PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr);
+    return UTF_N == 8
+               ? PyUnicode_DecodeUTF8(buffer, nbytes, nullptr)
+               : UTF_N == 16
+                     ? PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr)
+                     : PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr);
 #else
-        // PyPy seems to have multiple problems related to PyUnicode_UTF*: the UTF8 version
-        // sometimes segfaults for unknown reasons, while the UTF16 and 32 versions require a
-        // non-const char * arguments, which is also a nuisance, so bypass the whole thing by just
-        // passing the encoding as a string value, which works properly:
-        return PyUnicode_Decode(buffer, nbytes, UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr);
+    // PyPy seems to have multiple problems related to PyUnicode_UTF*: the UTF8
+    // version sometimes segfaults for unknown reasons, while the UTF16 and 32
+    // versions require non-const char * arguments, which is also a nuisance,
+    // so bypass the whole thing by just passing the encoding as a string value,
+    // which works properly:
+    return PyUnicode_Decode(
+        buffer, nbytes,
+        UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr);
 #endif
+  }
+
+  // When loading into a std::string or char*, accept a bytes object as-is (i.e.
+  // without any encoding/decoding attempt).  For other C++ char sizes this is a
+  // no-op; Python 2's load() converts the str to unicode instead of calling it.
+  template <typename C = CharT>
+  bool load_bytes(enable_if_t<sizeof(C) == 1, handle> src) {
+    if (PYBIND11_BYTES_CHECK(src.ptr())) {
+      // We were passed a Python 3 raw bytes; accept it into a std::string or
+      // char* without any encoding attempt.
+      const char *bytes = PYBIND11_BYTES_AS_STRING(src.ptr());
+      if (bytes) {
+        value = StringType(bytes, (size_t)PYBIND11_BYTES_SIZE(src.ptr()));
+        return true;
+      }
     }
 
-    // When loading into a std::string or char*, accept a bytes object as-is (i.e.
-    // without any encoding/decoding attempt).  For other C++ char sizes this is a no-op.
-    // which supports loading a unicode from a str, doesn't take this path.
-    template <typename C = CharT>
-    bool load_bytes(enable_if_t<sizeof(C) == 1, handle> src) {
-        if (PYBIND11_BYTES_CHECK(src.ptr())) {
-            // We were passed a Python 3 raw bytes; accept it into a std::string or char*
-            // without any encoding attempt.
-            const char *bytes = PYBIND11_BYTES_AS_STRING(src.ptr());
-            if (bytes) {
-                value = StringType(bytes, (size_t) PYBIND11_BYTES_SIZE(src.ptr()));
-                return true;
-            }
-        }
+    return false;
+  }
 
-        return false;
-    }
-
-    template <typename C = CharT>
-    bool load_bytes(enable_if_t<sizeof(C) != 1, handle>) { return false; }
+  template <typename C = CharT>
+  bool load_bytes(enable_if_t<sizeof(C) != 1, handle>) {
+    return false;
+  }
 };
 
 template <typename CharT, class Traits, class Allocator>
-struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_std_char_type<CharT>::value>>
+struct type_caster<std::basic_string<CharT, Traits, Allocator>,
+                   enable_if_t<is_std_char_type<CharT>::value>>
     : string_caster<std::basic_string<CharT, Traits, Allocator>> {};
 
 #ifdef PYBIND11_HAS_STRING_VIEW
 template <typename CharT, class Traits>
-struct type_caster<std::basic_string_view<CharT, Traits>, enable_if_t<is_std_char_type<CharT>::value>>
+struct type_caster<std::basic_string_view<CharT, Traits>,
+                   enable_if_t<is_std_char_type<CharT>::value>>
     : string_caster<std::basic_string_view<CharT, Traits>, true> {};
 #endif
 
-// Type caster for C-style strings.  We basically use a std::string type caster, but also add the
-// ability to use None as a nullptr char* (which the string caster doesn't allow).
-template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>> {
-    using StringType = std::basic_string<CharT>;
-    using StringCaster = type_caster<StringType>;
-    StringCaster str_caster;
-    bool none = false;
-    CharT one_char = 0;
+// Type caster for C-style strings.  We basically use a std::string type caster,
+// but also add the ability to use None as a nullptr char* (which the string
+// caster doesn't allow).
+template <typename CharT>
+struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>> {
+  using StringType = std::basic_string<CharT>;
+  using StringCaster = type_caster<StringType>;
+  StringCaster str_caster;
+  bool none = false;
+  CharT one_char = 0;
+
 public:
-    bool load(handle src, bool convert) {
-        if (!src) return false;
-        if (src.is_none()) {
-            // Defer accepting None to other overloads (if we aren't in convert mode):
-            if (!convert) return false;
-            none = true;
-            return true;
+  bool load(handle src, bool convert) {
+    if (!src)
+      return false;
+    if (src.is_none()) {
+      // Defer accepting None to other overloads (if we aren't in convert mode):
+      if (!convert)
+        return false;
+      none = true;
+      return true;
+    }
+    return str_caster.load(src, convert);
+  }
+
+  static handle cast(const CharT *src, return_value_policy policy,
+                     handle parent) {
+    if (src == nullptr)
+      return pybind11::none().inc_ref();
+    return StringCaster::cast(StringType(src), policy, parent);
+  }
+
+  static handle cast(CharT src, return_value_policy policy, handle parent) {
+    if (std::is_same<char, CharT>::value) {
+      handle s = PyUnicode_DecodeLatin1((const char *)&src, 1, nullptr);
+      if (!s)
+        throw error_already_set();
+      return s;
+    }
+    return StringCaster::cast(StringType(1, src), policy, parent);
+  }
+
+  operator CharT *() {
+    return none ? nullptr
+                : const_cast<CharT *>(
+                      static_cast<StringType &>(str_caster).c_str());
+  }
+  operator CharT &() {
+    if (none)
+      throw value_error("Cannot convert None to a character");
+
+    auto &value = static_cast<StringType &>(str_caster);
+    size_t str_len = value.size();
+    if (str_len == 0)
+      throw value_error("Cannot convert empty string to a character");
+
+    // If we're in UTF-8 mode, we have two possible failures: one for a unicode
+    // character that is too high, and one for multiple unicode characters
+    // (caught later), so we need to figure out how long the first encoded
+    // character is in bytes to distinguish between these two errors.  We also
+    // want to allow unicode characters U+0080 through U+00FF, as those
+    // can fit into a single char value.
+    if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) {
+      unsigned char v0 = static_cast<unsigned char>(value[0]);
+      size_t char0_bytes =
+          !(v0 & 0x80) ? 1 :            // low bits only: 0-127
+              (v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence
+                  (v0 & 0xF0) == 0xE0 ? 3
+                                      : // 0b1110xxxx - start of 3-byte sequence
+                      4;                // 0b11110xxx - start of 4-byte sequence
+
+      if (char0_bytes == str_len) {
+        // If we have a 128-255 value, we can decode it into a single char:
+        if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0x110000xx 0x10xxxxxx
+          one_char = static_cast<CharT>(
+              ((v0 & 3) << 6) + (static_cast<unsigned char>(value[1]) & 0x3F));
+          return one_char;
         }
-        return str_caster.load(src, convert);
+        // Otherwise we have a single character, but it's > U+00FF
+        throw value_error("Character code point not in range(0x100)");
+      }
     }
 
-    static handle cast(const CharT *src, return_value_policy policy, handle parent) {
-        if (src == nullptr) return pybind11::none().inc_ref();
-        return StringCaster::cast(StringType(src), policy, parent);
+    // UTF-16 is much easier: we can only have a surrogate pair for values above
+    // U+FFFF, thus a surrogate pair with total length 2 instantly indicates a
+    // range error (but not a "your string was too long" error).
+    else if (StringCaster::UTF_N == 16 && str_len == 2) {
+      one_char = static_cast<CharT>(value[0]);
+      if (one_char >= 0xD800 && one_char < 0xE000)
+        throw value_error("Character code point not in range(0x10000)");
     }
 
-    static handle cast(CharT src, return_value_policy policy, handle parent) {
-        if (std::is_same<char, CharT>::value) {
-            handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr);
-            if (!s) throw error_already_set();
-            return s;
-        }
-        return StringCaster::cast(StringType(1, src), policy, parent);
-    }
+    if (str_len != 1)
+      throw value_error(
+          "Expected a character, but multi-character string found");
 
-    operator CharT*() { return none ? nullptr : const_cast<CharT *>(static_cast<StringType &>(str_caster).c_str()); }
-    operator CharT&() {
-        if (none)
-            throw value_error("Cannot convert None to a character");
+    one_char = value[0];
+    return one_char;
+  }
 
-        auto &value = static_cast<StringType &>(str_caster);
-        size_t str_len = value.size();
-        if (str_len == 0)
-            throw value_error("Cannot convert empty string to a character");
-
-        // If we're in UTF-8 mode, we have two possible failures: one for a unicode character that
-        // is too high, and one for multiple unicode characters (caught later), so we need to figure
-        // out how long the first encoded character is in bytes to distinguish between these two
-        // errors.  We also allow want to allow unicode characters U+0080 through U+00FF, as those
-        // can fit into a single char value.
-        if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) {
-            unsigned char v0 = static_cast<unsigned char>(value[0]);
-            size_t char0_bytes = !(v0 & 0x80) ? 1 : // low bits only: 0-127
-                (v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence
-                (v0 & 0xF0) == 0xE0 ? 3 : // 0b1110xxxx - start of 3-byte sequence
-                4; // 0b11110xxx - start of 4-byte sequence
-
-            if (char0_bytes == str_len) {
-                // If we have a 128-255 value, we can decode it into a single char:
-                if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0x110000xx 0x10xxxxxx
-                    one_char = static_cast<CharT>(((v0 & 3) << 6) + (static_cast<unsigned char>(value[1]) & 0x3F));
-                    return one_char;
-                }
-                // Otherwise we have a single character, but it's > U+00FF
-                throw value_error("Character code point not in range(0x100)");
-            }
-        }
-
-        // UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a
-        // surrogate pair with total length 2 instantly indicates a range error (but not a "your
-        // string was too long" error).
-        else if (StringCaster::UTF_N == 16 && str_len == 2) {
-            one_char = static_cast<CharT>(value[0]);
-            if (one_char >= 0xD800 && one_char < 0xE000)
-                throw value_error("Character code point not in range(0x10000)");
-        }
-
-        if (str_len != 1)
-            throw value_error("Expected a character, but multi-character string found");
-
-        one_char = value[0];
-        return one_char;
-    }
-
-    static constexpr auto name = _(PYBIND11_STRING_NAME);
-    template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
+  static constexpr auto name = _(PYBIND11_STRING_NAME);
+  template <typename _T>
+  using cast_op_type = pybind11::detail::cast_op_type<_T>;
 };
 
 // Base implementation for std::tuple and std::pair
-template <template<typename...> class Tuple, typename... Ts> class tuple_caster {
-    using type = Tuple<Ts...>;
-    static constexpr auto size = sizeof...(Ts);
-    using indices = make_index_sequence<size>;
+template <template <typename...> class Tuple, typename... Ts>
+class tuple_caster {
+  using type = Tuple<Ts...>;
+  static constexpr auto size = sizeof...(Ts);
+  using indices = make_index_sequence<size>;
+
 public:
+  bool load(handle src, bool convert) {
+    if (!isinstance<sequence>(src))
+      return false;
+    const auto seq = reinterpret_borrow<sequence>(src);
+    if (seq.size() != size)
+      return false;
+    return load_impl(seq, convert, indices{});
+  }
 
-    bool load(handle src, bool convert) {
-        if (!isinstance<sequence>(src))
-            return false;
-        const auto seq = reinterpret_borrow<sequence>(src);
-        if (seq.size() != size)
-            return false;
-        return load_impl(seq, convert, indices{});
-    }
+  template <typename T>
+  static handle cast(T &&src, return_value_policy policy, handle parent) {
+    return cast_impl(std::forward<T>(src), policy, parent, indices{});
+  }
 
-    template <typename T>
-    static handle cast(T &&src, return_value_policy policy, handle parent) {
-        return cast_impl(std::forward<T>(src), policy, parent, indices{});
-    }
+  static constexpr auto name =
+      _("Tuple[") + concat(make_caster<Ts>::name...) + _("]");
 
-    static constexpr auto name = _("Tuple[") + concat(make_caster<Ts>::name...) + _("]");
+  template <typename T> using cast_op_type = type;
 
-    template <typename T> using cast_op_type = type;
-
-    operator type() & { return implicit_cast(indices{}); }
-    operator type() && { return std::move(*this).implicit_cast(indices{}); }
+  operator type() & { return implicit_cast(indices{}); }
+  operator type() && { return std::move(*this).implicit_cast(indices{}); }
 
 protected:
-    template <size_t... Is>
-    type implicit_cast(index_sequence<Is...>) & { return type(cast_op<Ts>(std::get<Is>(subcasters))...); }
-    template <size_t... Is>
-    type implicit_cast(index_sequence<Is...>) && { return type(cast_op<Ts>(std::move(std::get<Is>(subcasters)))...); }
+  template <size_t... Is> type implicit_cast(index_sequence<Is...>) & {
+    return type(cast_op<Ts>(std::get<Is>(subcasters))...);
+  }
+  template <size_t... Is> type implicit_cast(index_sequence<Is...>) && {
+    return type(cast_op<Ts>(std::move(std::get<Is>(subcasters)))...);
+  }
 
-    static constexpr bool load_impl(const sequence &, bool, index_sequence<>) { return true; }
+  static constexpr bool load_impl(const sequence &, bool, index_sequence<>) {
+    return true;
+  }
 
-    template <size_t... Is>
-    bool load_impl(const sequence &seq, bool convert, index_sequence<Is...>) {
-        for (bool r : {std::get<Is>(subcasters).load(seq[Is], convert)...})
-            if (!r)
-                return false;
-        return true;
-    }
+  template <size_t... Is>
+  bool load_impl(const sequence &seq, bool convert, index_sequence<Is...>) {
+    for (bool r : {std::get<Is>(subcasters).load(seq[Is], convert)...})
+      if (!r)
+        return false;
+    return true;
+  }
 
-    /* Implementation: Convert a C++ tuple into a Python tuple */
-    template <typename T, size_t... Is>
-    static handle cast_impl(T &&src, return_value_policy policy, handle parent, index_sequence<Is...>) {
-        std::array<object, size> entries{{
-            reinterpret_steal<object>(make_caster<Ts>::cast(std::get<Is>(std::forward<T>(src)), policy, parent))...
-        }};
-        for (const auto &entry: entries)
-            if (!entry)
-                return handle();
-        tuple result(size);
-        int counter = 0;
-        for (auto & entry: entries)
-            PyTuple_SET_ITEM(result.ptr(), counter++, entry.release().ptr());
-        return result.release();
-    }
+  /* Implementation: Convert a C++ tuple into a Python tuple */
+  template <typename T, size_t... Is>
+  static handle cast_impl(T &&src, return_value_policy policy, handle parent,
+                          index_sequence<Is...>) {
+    std::array<object, size> entries{
+        {reinterpret_steal<object>(make_caster<Ts>::cast(
+            std::get<Is>(std::forward<T>(src)), policy, parent))...}};
+    for (const auto &entry : entries)
+      if (!entry)
+        return handle();
+    tuple result(size);
+    int counter = 0;
+    for (auto &entry : entries)
+      PyTuple_SET_ITEM(result.ptr(), counter++, entry.release().ptr());
+    return result.release();
+  }
 
-    Tuple<make_caster<Ts>...> subcasters;
+  Tuple<make_caster<Ts>...> subcasters;
 };
 
-template <typename T1, typename T2> class type_caster<std::pair<T1, T2>>
-    : public tuple_caster<std::pair, T1, T2> {};
+template <typename T1, typename T2>
+class type_caster<std::pair<T1, T2>> : public tuple_caster<std::pair, T1, T2> {
+};
 
-template <typename... Ts> class type_caster<std::tuple<Ts...>>
-    : public tuple_caster<std::tuple, Ts...> {};
+template <typename... Ts>
+class type_caster<std::tuple<Ts...>> : public tuple_caster<std::tuple, Ts...> {
+};
 
-/// Helper class which abstracts away certain actions. Users can provide specializations for
-/// custom holders, but it's only necessary if the type has a non-standard interface.
-template <typename T>
-struct holder_helper {
-    static auto get(const T &p) -> decltype(p.get()) { return p.get(); }
+/// Helper class which abstracts away certain actions. Users can provide
+/// specializations for custom holders, but it's only necessary if the type has
+/// a non-standard interface.
+template <typename T> struct holder_helper {
+  static auto get(const T &p) -> decltype(p.get()) { return p.get(); }
 };
 
 /// Type caster for holder types like std::shared_ptr, etc.
 template <typename type, typename holder_type>
 struct copyable_holder_caster : public type_caster_base<type> {
 public:
-    using base = type_caster_base<type>;
-    static_assert(std::is_base_of<base, type_caster<type>>::value,
-            "Holder classes are only supported for custom types");
-    using base::base;
-    using base::cast;
-    using base::typeinfo;
-    using base::value;
+  using base = type_caster_base<type>;
+  static_assert(std::is_base_of<base, type_caster<type>>::value,
+                "Holder classes are only supported for custom types");
+  using base::base;
+  using base::cast;
+  using base::typeinfo;
+  using base::value;
 
-    bool load(handle src, bool convert) {
-        return base::template load_impl<copyable_holder_caster<type, holder_type>>(src, convert);
-    }
+  bool load(handle src, bool convert) {
+    return base::template load_impl<copyable_holder_caster<type, holder_type>>(
+        src, convert);
+  }
 
-    explicit operator type*() { return this->value; }
-    explicit operator type&() { return *(this->value); }
-    explicit operator holder_type*() { return std::addressof(holder); }
+  explicit operator type *() { return this->value; }
+  explicit operator type &() { return *(this->value); }
+  explicit operator holder_type *() { return std::addressof(holder); }
 
-    // Workaround for Intel compiler bug
-    // see pybind11 issue 94
-    #if defined(__ICC) || defined(__INTEL_COMPILER)
-    operator holder_type&() { return holder; }
-    #else
-    explicit operator holder_type&() { return holder; }
-    #endif
+// Workaround for Intel compiler bug
+// see pybind11 issue 94
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+  operator holder_type &() { return holder; }
+#else
+  explicit operator holder_type &() { return holder; }
+#endif
 
-    static handle cast(const holder_type &src, return_value_policy, handle) {
-        const auto *ptr = holder_helper<holder_type>::get(src);
-        return type_caster_base<type>::cast_holder(ptr, &src);
-    }
+  static handle cast(const holder_type &src, return_value_policy, handle) {
+    const auto *ptr = holder_helper<holder_type>::get(src);
+    return type_caster_base<type>::cast_holder(ptr, &src);
+  }
 
 protected:
-    friend class type_caster_generic;
-    void check_holder_compat() {
-        if (typeinfo->default_holder)
-            throw cast_error("Unable to load a custom holder type from a default-holder instance");
-    }
+  friend class type_caster_generic;
+  void check_holder_compat() {
+    if (typeinfo->default_holder)
+      throw cast_error(
+          "Unable to load a custom holder type from a default-holder instance");
+  }
 
-    bool load_value(value_and_holder &&v_h) {
-        if (v_h.holder_constructed()) {
-            value = v_h.value_ptr();
-            holder = v_h.template holder<holder_type>();
-            return true;
-        } else {
-            throw cast_error("Unable to cast from non-held to held instance (T& to Holder<T>) "
+  bool load_value(value_and_holder &&v_h) {
+    if (v_h.holder_constructed()) {
+      value = v_h.value_ptr();
+      holder = v_h.template holder<holder_type>();
+      return true;
+    } else {
+      throw cast_error(
+          "Unable to cast from non-held to held instance (T& to Holder<T>) "
 #if defined(NDEBUG)
-                             "(compile in debug mode for type information)");
+          "(compile in debug mode for type information)");
 #else
-                             "of type '" + type_id<holder_type>() + "''");
+          "of type '" +
+          type_id<holder_type>() + "''");
 #endif
-        }
     }
+  }
 
-    template <typename T = holder_type, detail::enable_if_t<!std::is_constructible<T, const T &, type*>::value, int> = 0>
-    bool try_implicit_casts(handle, bool) { return false; }
+  template <typename T = holder_type,
+            detail::enable_if_t<
+                !std::is_constructible<T, const T &, type *>::value, int> = 0>
+  bool try_implicit_casts(handle, bool) {
+    return false;
+  }
 
-    template <typename T = holder_type, detail::enable_if_t<std::is_constructible<T, const T &, type*>::value, int> = 0>
-    bool try_implicit_casts(handle src, bool convert) {
-        for (auto &cast : typeinfo->implicit_casts) {
-            copyable_holder_caster sub_caster(*cast.first);
-            if (sub_caster.load(src, convert)) {
-                value = cast.second(sub_caster.value);
-                holder = holder_type(sub_caster.holder, (type *) value);
-                return true;
-            }
-        }
-        return false;
+  template <typename T = holder_type,
+            detail::enable_if_t<
+                std::is_constructible<T, const T &, type *>::value, int> = 0>
+  bool try_implicit_casts(handle src, bool convert) {
+    for (auto &cast : typeinfo->implicit_casts) {
+      copyable_holder_caster sub_caster(*cast.first);
+      if (sub_caster.load(src, convert)) {
+        value = cast.second(sub_caster.value);
+        holder = holder_type(sub_caster.holder, (type *)value);
+        return true;
+      }
     }
+    return false;
+  }
 
-    static bool try_direct_conversions(handle) { return false; }
+  static bool try_direct_conversions(handle) { return false; }
 
-
-    holder_type holder;
+  holder_type holder;
 };
 
 /// Specialize for the common std::shared_ptr, so users don't need to
 template <typename T>
-class type_caster<std::shared_ptr<T>> : public copyable_holder_caster<T, std::shared_ptr<T>> { };
+class type_caster<std::shared_ptr<T>>
+    : public copyable_holder_caster<T, std::shared_ptr<T>> {};
 
-template <typename type, typename holder_type>
-struct move_only_holder_caster {
-    static_assert(std::is_base_of<type_caster_base<type>, type_caster<type>>::value,
-            "Holder classes are only supported for custom types");
+template <typename type, typename holder_type> struct move_only_holder_caster {
+  static_assert(
+      std::is_base_of<type_caster_base<type>, type_caster<type>>::value,
+      "Holder classes are only supported for custom types");
 
-    static handle cast(holder_type &&src, return_value_policy, handle) {
-        auto *ptr = holder_helper<holder_type>::get(src);
-        return type_caster_base<type>::cast_holder(ptr, std::addressof(src));
-    }
-    static constexpr auto name = type_caster_base<type>::name;
+  static handle cast(holder_type &&src, return_value_policy, handle) {
+    auto *ptr = holder_helper<holder_type>::get(src);
+    return type_caster_base<type>::cast_holder(ptr, std::addressof(src));
+  }
+  static constexpr auto name = type_caster_base<type>::name;
 };
 
 template <typename type, typename deleter>
 class type_caster<std::unique_ptr<type, deleter>>
-    : public move_only_holder_caster<type, std::unique_ptr<type, deleter>> { };
+    : public move_only_holder_caster<type, std::unique_ptr<type, deleter>> {};
 
 template <typename type, typename holder_type>
-using type_caster_holder = conditional_t<is_copy_constructible<holder_type>::value,
-                                         copyable_holder_caster<type, holder_type>,
-                                         move_only_holder_caster<type, holder_type>>;
+using type_caster_holder =
+    conditional_t<is_copy_constructible<holder_type>::value,
+                  copyable_holder_caster<type, holder_type>,
+                  move_only_holder_caster<type, holder_type>>;
 
-template <typename T, bool Value = false> struct always_construct_holder { static constexpr bool value = Value; };
+template <typename T, bool Value = false> struct always_construct_holder {
+  static constexpr bool value = Value;
+};
 
-/// Create a specialization for custom holder types (silently ignores std::shared_ptr)
-#define PYBIND11_DECLARE_HOLDER_TYPE(type, holder_type, ...) \
-    namespace pybind11 { namespace detail { \
-    template <typename type> \
-    struct always_construct_holder<holder_type> : always_construct_holder<void, ##__VA_ARGS__>  { }; \
-    template <typename type> \
-    class type_caster<holder_type, enable_if_t<!is_shared_ptr<holder_type>::value>> \
-        : public type_caster_holder<type, holder_type> { }; \
-    }}
+/// Create a specialization for custom holder types (silently ignores
+/// std::shared_ptr)
+#define PYBIND11_DECLARE_HOLDER_TYPE(type, holder_type, ...)                   \
+  namespace pybind11 {                                                         \
+  namespace detail {                                                           \
+  template <typename type>                                                     \
+  struct always_construct_holder<holder_type>                                  \
+      : always_construct_holder<void, ##__VA_ARGS__> {};                       \
+  template <typename type>                                                     \
+  class type_caster<holder_type,                                               \
+                    enable_if_t<!is_shared_ptr<holder_type>::value>>           \
+      : public type_caster_holder<type, holder_type> {};                       \
+  }                                                                            \
+  }
 
 // PYBIND11_DECLARE_HOLDER_TYPE holder types:
-template <typename base, typename holder> struct is_holder_type :
-    std::is_base_of<detail::type_caster_holder<base, holder>, detail::type_caster<holder>> {};
+template <typename base, typename holder>
+struct is_holder_type
+    : std::is_base_of<detail::type_caster_holder<base, holder>,
+                      detail::type_caster<holder>> {};
 // Specialization for always-supported unique_ptr holders:
-template <typename base, typename deleter> struct is_holder_type<base, std::unique_ptr<base, deleter>> :
-    std::true_type {};
+template <typename base, typename deleter>
+struct is_holder_type<base, std::unique_ptr<base, deleter>> : std::true_type {};
 
-template <typename T> struct handle_type_name { static constexpr auto name = _<T>(); };
-template <> struct handle_type_name<bytes> { static constexpr auto name = _(PYBIND11_BYTES_NAME); };
-template <> struct handle_type_name<args> { static constexpr auto name = _("*args"); };
-template <> struct handle_type_name<kwargs> { static constexpr auto name = _("**kwargs"); };
+template <typename T> struct handle_type_name {
+  static constexpr auto name = _<T>();
+};
+template <> struct handle_type_name<bytes> {
+  static constexpr auto name = _(PYBIND11_BYTES_NAME);
+};
+template <> struct handle_type_name<args> {
+  static constexpr auto name = _("*args");
+};
+template <> struct handle_type_name<kwargs> {
+  static constexpr auto name = _("**kwargs");
+};
 
-template <typename type>
-struct pyobject_caster {
-    template <typename T = type, enable_if_t<std::is_same<T, handle>::value, int> = 0>
-    bool load(handle src, bool /* convert */) { value = src; return static_cast<bool>(value); }
+template <typename type> struct pyobject_caster {
+  template <typename T = type,
+            enable_if_t<std::is_same<T, handle>::value, int> = 0>
+  bool load(handle src, bool /* convert */) {
+    value = src;
+    return static_cast<bool>(value);
+  }
 
-    template <typename T = type, enable_if_t<std::is_base_of<object, T>::value, int> = 0>
-    bool load(handle src, bool /* convert */) {
-        if (!isinstance<type>(src))
-            return false;
-        value = reinterpret_borrow<type>(src);
-        return true;
-    }
+  template <typename T = type,
+            enable_if_t<std::is_base_of<object, T>::value, int> = 0>
+  bool load(handle src, bool /* convert */) {
+    if (!isinstance<type>(src))
+      return false;
+    value = reinterpret_borrow<type>(src);
+    return true;
+  }
 
-    static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) {
-        return src.inc_ref();
-    }
-    PYBIND11_TYPE_CASTER(type, handle_type_name<type>::name);
+  static handle cast(const handle &src, return_value_policy /* policy */,
+                     handle /* parent */) {
+    return src.inc_ref();
+  }
+  PYBIND11_TYPE_CASTER(type, handle_type_name<type>::name);
 };
 
 template <typename T>
-class type_caster<T, enable_if_t<is_pyobject<T>::value>> : public pyobject_caster<T> { };
+class type_caster<T, enable_if_t<is_pyobject<T>::value>>
+    : public pyobject_caster<T> {};
 
 // Our conditions for enabling moving are quite restrictive:
 // At compile time:
@@ -1583,138 +1808,181 @@ class type_caster<T, enable_if_t<is_pyobject<T>::value>> : public pyobject_caste
 // - type_caster<T>::operator T&() must exist
 // - the type must be move constructible (obviously)
 // At run-time:
-// - if the type is non-copy-constructible, the object must be the sole owner of the type (i.e. it
+// - if the type is non-copy-constructible, the object must be the sole owner of
+//   the type (i.e. it
 //   must have ref_count() == 1)h
 // If any of the above are not satisfied, we fall back to copying.
-template <typename T> using move_is_plain_type = satisfies_none_of<T,
-    std::is_void, std::is_pointer, std::is_reference, std::is_const
->;
-template <typename T, typename SFINAE = void> struct move_always : std::false_type {};
-template <typename T> struct move_always<T, enable_if_t<all_of<
-    move_is_plain_type<T>,
-    negation<is_copy_constructible<T>>,
-    std::is_move_constructible<T>,
-    std::is_same<decltype(std::declval<make_caster<T>>().operator T&()), T&>
->::value>> : std::true_type {};
-template <typename T, typename SFINAE = void> struct move_if_unreferenced : std::false_type {};
-template <typename T> struct move_if_unreferenced<T, enable_if_t<all_of<
-    move_is_plain_type<T>,
-    negation<move_always<T>>,
-    std::is_move_constructible<T>,
-    std::is_same<decltype(std::declval<make_caster<T>>().operator T&()), T&>
->::value>> : std::true_type {};
-template <typename T> using move_never = none_of<move_always<T>, move_if_unreferenced<T>>;
+template <typename T>
+using move_is_plain_type = satisfies_none_of<T, std::is_void, std::is_pointer,
+                                             std::is_reference, std::is_const>;
+template <typename T, typename SFINAE = void>
+struct move_always : std::false_type {};
+template <typename T>
+struct move_always<
+    T, enable_if_t<all_of<
+           move_is_plain_type<T>, negation<is_copy_constructible<T>>,
+           std::is_move_constructible<T>,
+           std::is_same<decltype(std::declval<make_caster<T>>().operator T &()),
+                        T &>>::value>> : std::true_type {};
+template <typename T, typename SFINAE = void>
+struct move_if_unreferenced : std::false_type {};
+template <typename T>
+struct move_if_unreferenced<
+    T, enable_if_t<all_of<
+           move_is_plain_type<T>, negation<move_always<T>>,
+           std::is_move_constructible<T>,
+           std::is_same<decltype(std::declval<make_caster<T>>().operator T &()),
+                        T &>>::value>> : std::true_type {};
+template <typename T>
+using move_never = none_of<move_always<T>, move_if_unreferenced<T>>;
 
-// Detect whether returning a `type` from a cast on type's type_caster is going to result in a
-// reference or pointer to a local variable of the type_caster.  Basically, only
-// non-reference/pointer `type`s and reference/pointers from a type_caster_generic are safe;
-// everything else returns a reference/pointer to a local variable.
-template <typename type> using cast_is_temporary_value_reference = bool_constant<
+// Detect whether returning a `type` from a cast on type's type_caster is going
+// to result in a reference or pointer to a local variable of the type_caster.
+// Basically, only non-reference/pointer `type`s and reference/pointers from a
+// type_caster_generic are safe; everything else returns a reference/pointer to
+// a local variable.
+template <typename type>
+using cast_is_temporary_value_reference = bool_constant<
     (std::is_reference<type>::value || std::is_pointer<type>::value) &&
     !std::is_base_of<type_caster_generic, make_caster<type>>::value &&
-    !std::is_same<intrinsic_t<type>, void>::value
->;
+    !std::is_same<intrinsic_t<type>, void>::value>;
 
-// When a value returned from a C++ function is being cast back to Python, we almost always want to
-// force `policy = move`, regardless of the return value policy the function/method was declared
-// with.
-template <typename Return, typename SFINAE = void> struct return_value_policy_override {
-    static return_value_policy policy(return_value_policy p) { return p; }
+// When a value returned from a C++ function is being cast back to Python, we
+// almost always want to force `policy = move`, regardless of the return value
+// policy the function/method was declared with.
+template <typename Return, typename SFINAE = void>
+struct return_value_policy_override {
+  static return_value_policy policy(return_value_policy p) { return p; }
 };
 
-template <typename Return> struct return_value_policy_override<Return,
-        detail::enable_if_t<std::is_base_of<type_caster_generic, make_caster<Return>>::value, void>> {
-    static return_value_policy policy(return_value_policy p) {
-        return !std::is_lvalue_reference<Return>::value &&
-               !std::is_pointer<Return>::value
-                   ? return_value_policy::move : p;
-    }
+template <typename Return>
+struct return_value_policy_override<
+    Return, detail::enable_if_t<std::is_base_of<type_caster_generic,
+                                                make_caster<Return>>::value,
+                                void>> {
+  static return_value_policy policy(return_value_policy p) {
+    return !std::is_lvalue_reference<Return>::value &&
+                   !std::is_pointer<Return>::value
+               ? return_value_policy::move
+               : p;
+  }
 };
 
 // Basic python -> C++ casting; throws if casting fails
-template <typename T, typename SFINAE> type_caster<T, SFINAE> &load_type(type_caster<T, SFINAE> &conv, const handle &handle) {
-    if (!conv.load(handle, true)) {
+template <typename T, typename SFINAE>
+type_caster<T, SFINAE> &load_type(type_caster<T, SFINAE> &conv,
+                                  const handle &handle) {
+  if (!conv.load(handle, true)) {
 #if defined(NDEBUG)
-        throw cast_error("Unable to cast Python instance to C++ type (compile in debug mode for details)");
+    throw cast_error("Unable to cast Python instance to C++ type (compile in "
+                     "debug mode for details)");
 #else
-        throw cast_error("Unable to cast Python instance of type " +
-            (std::string) str(handle.get_type()) + " to C++ type '" + type_id<T>() + "'");
+    throw cast_error("Unable to cast Python instance of type " +
+                     (std::string)str(handle.get_type()) + " to C++ type '" +
+                     type_id<T>() + "'");
 #endif
-    }
-    return conv;
+  }
+  return conv;
 }
 // Wrapper around the above that also constructs and returns a type_caster
 template <typename T> make_caster<T> load_type(const handle &handle) {
-    make_caster<T> conv;
-    load_type(conv, handle);
-    return conv;
+  make_caster<T> conv;
+  load_type(conv, handle);
+  return conv;
 }
 
 NAMESPACE_END(detail)
 
 // pytype -> C++ type
-template <typename T, detail::enable_if_t<!detail::is_pyobject<T>::value, int> = 0>
+template <typename T,
+          detail::enable_if_t<!detail::is_pyobject<T>::value, int> = 0>
 T cast(const handle &handle) {
-    using namespace detail;
-    static_assert(!cast_is_temporary_value_reference<T>::value,
-            "Unable to cast type to reference: value is local to type caster");
-    return cast_op<T>(load_type<T>(handle));
+  using namespace detail;
+  static_assert(
+      !cast_is_temporary_value_reference<T>::value,
+      "Unable to cast type to reference: value is local to type caster");
+  return cast_op<T>(load_type<T>(handle));
 }
 
 // pytype -> pytype (calls converting constructor)
-template <typename T, detail::enable_if_t<detail::is_pyobject<T>::value, int> = 0>
-T cast(const handle &handle) { return T(reinterpret_borrow<object>(handle)); }
-
-// C++ type -> py::object
-template <typename T, detail::enable_if_t<!detail::is_pyobject<T>::value, int> = 0>
-object cast(const T &value, return_value_policy policy = return_value_policy::automatic_reference,
-            handle parent = handle()) {
-    if (policy == return_value_policy::automatic)
-        policy = std::is_pointer<T>::value ? return_value_policy::take_ownership : return_value_policy::copy;
-    else if (policy == return_value_policy::automatic_reference)
-        policy = std::is_pointer<T>::value ? return_value_policy::reference : return_value_policy::copy;
-    return reinterpret_steal<object>(detail::make_caster<T>::cast(value, policy, parent));
+template <typename T,
+          detail::enable_if_t<detail::is_pyobject<T>::value, int> = 0>
+T cast(const handle &handle) {
+  return T(reinterpret_borrow<object>(handle));
 }
 
-template <typename T> T handle::cast() const { return pybind11::cast<T>(*this); }
+// C++ type -> py::object
+template <typename T,
+          detail::enable_if_t<!detail::is_pyobject<T>::value, int> = 0>
+object
+cast(const T &value,
+     return_value_policy policy = return_value_policy::automatic_reference,
+     handle parent = handle()) {
+  if (policy == return_value_policy::automatic)
+    policy = std::is_pointer<T>::value ? return_value_policy::take_ownership
+                                       : return_value_policy::copy;
+  else if (policy == return_value_policy::automatic_reference)
+    policy = std::is_pointer<T>::value ? return_value_policy::reference
+                                       : return_value_policy::copy;
+  return reinterpret_steal<object>(
+      detail::make_caster<T>::cast(value, policy, parent));
+}
+
+template <typename T> T handle::cast() const {
+  return pybind11::cast<T>(*this);
+}
 template <> inline void handle::cast() const { return; }
 
 template <typename T>
 detail::enable_if_t<!detail::move_never<T>::value, T> move(object &&obj) {
-    if (obj.ref_count() > 1)
+  if (obj.ref_count() > 1)
 #if defined(NDEBUG)
-        throw cast_error("Unable to cast Python instance to C++ rvalue: instance has multiple references"
-            " (compile in debug mode for details)");
+    throw cast_error("Unable to cast Python instance to C++ rvalue: instance "
+                     "has multiple references"
+                     " (compile in debug mode for details)");
 #else
-        throw cast_error("Unable to move from Python " + (std::string) str(obj.get_type()) +
-                " instance to C++ " + type_id<T>() + " instance: instance has multiple references");
+    throw cast_error("Unable to move from Python " +
+                     (std::string)str(obj.get_type()) + " instance to C++ " +
+                     type_id<T>() +
+                     " instance: instance has multiple references");
 #endif
 
-    // Move into a temporary and return that, because the reference may be a local value of `conv`
-    T ret = std::move(detail::load_type<T>(obj).operator T&());
-    return ret;
+  // Move into a temporary and return that, because the reference may be a local
+  // value of `conv`
+  T ret = std::move(detail::load_type<T>(obj).operator T &());
+  return ret;
 }
 
-// Calling cast() on an rvalue calls pybind::cast with the object rvalue, which does:
-// - If we have to move (because T has no copy constructor), do it.  This will fail if the moved
+// Calling cast() on an rvalue calls pybind::cast with the object rvalue, which
+// does:
+// - If we have to move (because T has no copy constructor), do it.  This will
+//   fail if the moved
 //   object has multiple references, but trying to copy will fail to compile.
 // - If both movable and copyable, check ref count: if 1, move; otherwise copy
 // - Otherwise (not movable), copy.
-template <typename T> detail::enable_if_t<detail::move_always<T>::value, T> cast(object &&object) {
+template <typename T>
+detail::enable_if_t<detail::move_always<T>::value, T> cast(object &&object) {
+  return move<T>(std::move(object));
+}
+template <typename T>
+detail::enable_if_t<detail::move_if_unreferenced<T>::value, T>
+cast(object &&object) {
+  if (object.ref_count() > 1)
+    return cast<T>(object);
+  else
     return move<T>(std::move(object));
 }
-template <typename T> detail::enable_if_t<detail::move_if_unreferenced<T>::value, T> cast(object &&object) {
-    if (object.ref_count() > 1)
-        return cast<T>(object);
-    else
-        return move<T>(std::move(object));
-}
-template <typename T> detail::enable_if_t<detail::move_never<T>::value, T> cast(object &&object) {
-    return cast<T>(object);
+template <typename T>
+detail::enable_if_t<detail::move_never<T>::value, T> cast(object &&object) {
+  return cast<T>(object);
 }
 
-template <typename T> T object::cast() const & { return pybind11::cast<T>(*this); }
-template <typename T> T object::cast() && { return pybind11::cast<T>(std::move(*this)); }
+template <typename T> T object::cast() const & {
+  return pybind11::cast<T>(*this);
+}
+template <typename T> T object::cast() && {
+  return pybind11::cast<T>(std::move(*this));
+}
 template <> inline void object::cast() const & { return; }
 template <> inline void object::cast() && { return; }
 
@@ -1722,121 +1990,161 @@ NAMESPACE_BEGIN(detail)
 
 // Declared in pytypes.h:
 template <typename T, enable_if_t<!is_pyobject<T>::value, int>>
-object object_or_cast(T &&o) { return pybind11::cast(std::forward<T>(o)); }
-
-struct overload_unused {}; // Placeholder type for the unneeded (and dead code) static variable in the OVERLOAD_INT macro
-template <typename ret_type> using overload_caster_t = conditional_t<
-    cast_is_temporary_value_reference<ret_type>::value, make_caster<ret_type>, overload_unused>;
-
-// Trampoline use: for reference/pointer types to value-converted values, we do a value cast, then
-// store the result in the given variable.  For other types, this is a no-op.
-template <typename T> enable_if_t<cast_is_temporary_value_reference<T>::value, T> cast_ref(object &&o, make_caster<T> &caster) {
-    return cast_op<T>(load_type(caster, o));
+object object_or_cast(T &&o) {
+  return pybind11::cast(std::forward<T>(o));
 }
-template <typename T> enable_if_t<!cast_is_temporary_value_reference<T>::value, T> cast_ref(object &&, overload_unused &) {
-    pybind11_fail("Internal error: cast_ref fallback invoked"); }
 
-// Trampoline use: Having a pybind11::cast with an invalid reference type is going to static_assert, even
-// though if it's in dead code, so we provide a "trampoline" to pybind11::cast that only does anything in
-// cases where pybind11::cast is valid.
-template <typename T> enable_if_t<!cast_is_temporary_value_reference<T>::value, T> cast_safe(object &&o) {
-    return pybind11::cast<T>(std::move(o)); }
-template <typename T> enable_if_t<cast_is_temporary_value_reference<T>::value, T> cast_safe(object &&) {
-    pybind11_fail("Internal error: cast_safe fallback invoked"); }
+struct overload_unused {}; // Placeholder type for the unneeded (and dead code)
+                           // static variable in the OVERLOAD_INT macro
+template <typename ret_type>
+using overload_caster_t =
+    conditional_t<cast_is_temporary_value_reference<ret_type>::value,
+                  make_caster<ret_type>, overload_unused>;
+
+// Trampoline use: for reference/pointer types to value-converted values, we do
+// a value cast, then store the result in the given variable.  For other types,
+// this is a no-op.
+template <typename T>
+enable_if_t<cast_is_temporary_value_reference<T>::value, T>
+cast_ref(object &&o, make_caster<T> &caster) {
+  return cast_op<T>(load_type(caster, o));
+}
+template <typename T>
+enable_if_t<!cast_is_temporary_value_reference<T>::value, T>
+cast_ref(object &&, overload_unused &) {
+  pybind11_fail("Internal error: cast_ref fallback invoked");
+}
+
+// Trampoline use: Having a pybind11::cast with an invalid reference type is
+// going to static_assert, even if it's in dead code, so we provide a
+// "trampoline" to pybind11::cast that only does anything in cases where
+// pybind11::cast is valid.
+template <typename T>
+enable_if_t<!cast_is_temporary_value_reference<T>::value, T>
+cast_safe(object &&o) {
+  return pybind11::cast<T>(std::move(o));
+}
+template <typename T>
+enable_if_t<cast_is_temporary_value_reference<T>::value, T>
+cast_safe(object &&) {
+  pybind11_fail("Internal error: cast_safe fallback invoked");
+}
 template <> inline void cast_safe<void>(object &&) {}
 
 NAMESPACE_END(detail)
 
 template <return_value_policy policy = return_value_policy::automatic_reference>
-tuple make_tuple() { return tuple(0); }
+tuple make_tuple() {
+  return tuple(0);
+}
 
 template <return_value_policy policy = return_value_policy::automatic_reference,
-          typename... Args> tuple make_tuple(Args&&... args_) {
-    constexpr size_t size = sizeof...(Args);
-    std::array<object, size> args {
-        { reinterpret_steal<object>(detail::make_caster<Args>::cast(
-            std::forward<Args>(args_), policy, nullptr))... }
-    };
-    for (size_t i = 0; i < args.size(); i++) {
-        if (!args[i]) {
+          typename... Args>
+tuple make_tuple(Args &&... args_) {
+  constexpr size_t size = sizeof...(Args);
+  std::array<object, size> args{
+      {reinterpret_steal<object>(detail::make_caster<Args>::cast(
+          std::forward<Args>(args_), policy, nullptr))...}};
+  for (size_t i = 0; i < args.size(); i++) {
+    if (!args[i]) {
 #if defined(NDEBUG)
-            throw cast_error("make_tuple(): unable to convert arguments to Python object (compile in debug mode for details)");
+      throw cast_error("make_tuple(): unable to convert arguments to Python "
+                       "object (compile in debug mode for details)");
 #else
-            std::array<std::string, size> argtypes { {type_id<Args>()...} };
-            throw cast_error("make_tuple(): unable to convert argument of type '" +
-                argtypes[i] + "' to Python object");
+      std::array<std::string, size> argtypes{{type_id<Args>()...}};
+      throw cast_error("make_tuple(): unable to convert argument of type '" +
+                       argtypes[i] + "' to Python object");
 #endif
-        }
     }
-    tuple result(size);
-    int counter = 0;
-    for (auto &arg_value : args)
-        PyTuple_SET_ITEM(result.ptr(), counter++, arg_value.release().ptr());
-    return result;
+  }
+  tuple result(size);
+  int counter = 0;
+  for (auto &arg_value : args)
+    PyTuple_SET_ITEM(result.ptr(), counter++, arg_value.release().ptr());
+  return result;
 }
 
 /// \ingroup annotations
 /// Annotation for arguments
 struct arg {
-    /// Constructs an argument with the name of the argument; if null or omitted, this is a positional argument.
-    constexpr explicit arg(const char *name = nullptr) : name(name), flag_noconvert(false), flag_none(true) { }
-    /// Assign a value to this argument
-    template <typename T> arg_v operator=(T &&value) const;
-    /// Indicate that the type should not be converted in the type caster
-    arg &noconvert(bool flag = true) { flag_noconvert = flag; return *this; }
-    /// Indicates that the argument should/shouldn't allow None (e.g. for nullable pointer args)
-    arg &none(bool flag = true) { flag_none = flag; return *this; }
+  /// Constructs an argument with the name of the argument; if null or omitted,
+  /// this is a positional argument.
+  constexpr explicit arg(const char *name = nullptr)
+      : name(name), flag_noconvert(false), flag_none(true) {}
+  /// Assign a value to this argument
+  template <typename T> arg_v operator=(T &&value) const;
+  /// Indicate that the type should not be converted in the type caster
+  arg &noconvert(bool flag = true) {
+    flag_noconvert = flag;
+    return *this;
+  }
+  /// Indicates that the argument should/shouldn't allow None (e.g. for nullable
+  /// pointer args)
+  arg &none(bool flag = true) {
+    flag_none = flag;
+    return *this;
+  }
 
-    const char *name; ///< If non-null, this is a named kwargs argument
-    bool flag_noconvert : 1; ///< If set, do not allow conversion (requires a supporting type caster!)
-    bool flag_none : 1; ///< If set (the default), allow None to be passed to this argument
+  const char *name;        ///< If non-null, this is a named kwargs argument
+  bool flag_noconvert : 1; ///< If set, do not allow conversion (requires a
+                           ///< supporting type caster!)
+  bool flag_none : 1; ///< If set (the default), allow None to be passed to this
+                      ///< argument
 };
 
 /// \ingroup annotations
 /// Annotation for arguments with values
 struct arg_v : arg {
 private:
-    template <typename T>
-    arg_v(arg &&base, T &&x, const char *descr = nullptr)
-        : arg(base),
-          value(reinterpret_steal<object>(
-              detail::make_caster<T>::cast(x, return_value_policy::automatic, {})
-          )),
-          descr(descr)
+  template <typename T>
+  arg_v(arg &&base, T &&x, const char *descr = nullptr)
+      : arg(base), value(reinterpret_steal<object>(detail::make_caster<T>::cast(
+                       x, return_value_policy::automatic, {}))),
+        descr(descr)
 #if !defined(NDEBUG)
-        , type(type_id<T>())
+        ,
+        type(type_id<T>())
 #endif
-    { }
+  {
+  }
 
 public:
-    /// Direct construction with name, default, and description
-    template <typename T>
-    arg_v(const char *name, T &&x, const char *descr = nullptr)
-        : arg_v(arg(name), std::forward<T>(x), descr) { }
+  /// Direct construction with name, default, and description
+  template <typename T>
+  arg_v(const char *name, T &&x, const char *descr = nullptr)
+      : arg_v(arg(name), std::forward<T>(x), descr) {}
 
-    /// Called internally when invoking `py::arg("a") = value`
-    template <typename T>
-    arg_v(const arg &base, T &&x, const char *descr = nullptr)
-        : arg_v(arg(base), std::forward<T>(x), descr) { }
+  /// Called internally when invoking `py::arg("a") = value`
+  template <typename T>
+  arg_v(const arg &base, T &&x, const char *descr = nullptr)
+      : arg_v(arg(base), std::forward<T>(x), descr) {}
 
-    /// Same as `arg::noconvert()`, but returns *this as arg_v&, not arg&
-    arg_v &noconvert(bool flag = true) { arg::noconvert(flag); return *this; }
+  /// Same as `arg::noconvert()`, but returns *this as arg_v&, not arg&
+  arg_v &noconvert(bool flag = true) {
+    arg::noconvert(flag);
+    return *this;
+  }
 
-    /// Same as `arg::nonone()`, but returns *this as arg_v&, not arg&
-    arg_v &none(bool flag = true) { arg::none(flag); return *this; }
+  /// Same as `arg::none()`, but returns *this as arg_v&, not arg&
+  arg_v &none(bool flag = true) {
+    arg::none(flag);
+    return *this;
+  }
 
-    /// The default value
-    object value;
-    /// The (optional) description of the default value
-    const char *descr;
+  /// The default value
+  object value;
+  /// The (optional) description of the default value
+  const char *descr;
 #if !defined(NDEBUG)
-    /// The C++ type name of the default value (only available when compiled in debug mode)
-    std::string type;
+  /// The C++ type name of the default value (only available when compiled in
+  /// debug mode)
+  std::string type;
 #endif
 };
 
-template <typename T>
-arg_v arg::operator=(T &&value) const { return {std::move(*this), std::forward<T>(value)}; }
+template <typename T> arg_v arg::operator=(T &&value) const {
+  return {std::move(*this), std::forward<T>(value)};
+}
 
 /// Alias for backward compatibility -- to be removed in version 2.0
 template <typename /*unused*/> using arg_t = arg_v;
@@ -1846,7 +2154,7 @@ inline namespace literals {
     String literal version of `arg`
  \endrst */
 constexpr arg operator"" _a(const char *name, size_t) { return arg(name); }
-}
+} // namespace literals
 
 NAMESPACE_BEGIN(detail)
 
@@ -1855,274 +2163,298 @@ struct function_record;
 
 /// Internal data associated with a single function call
 struct function_call {
-    function_call(const function_record &f, handle p); // Implementation in attr.h
+  function_call(const function_record &f, handle p); // Implementation in attr.h
 
-    /// The function data:
-    const function_record &func;
+  /// The function data:
+  const function_record &func;
 
-    /// Arguments passed to the function:
-    std::vector<handle> args;
+  /// Arguments passed to the function:
+  std::vector<handle> args;
 
-    /// The `convert` value the arguments should be loaded with
-    std::vector<bool> args_convert;
+  /// The `convert` value the arguments should be loaded with
+  std::vector<bool> args_convert;
 
-    /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
-    /// present, are also in `args` but without a reference).
-    object args_ref, kwargs_ref;
+  /// Extra references for the optional `py::args` and/or `py::kwargs` arguments
+  /// (which, if present, are also in `args` but without a reference).
+  object args_ref, kwargs_ref;
 
-    /// The parent, if any
-    handle parent;
+  /// The parent, if any
+  handle parent;
 
-    /// If this is a call to an initializer, this argument contains `self`
-    handle init_self;
+  /// If this is a call to an initializer, this argument contains `self`
+  handle init_self;
 };
 
-
 /// Helper class which loads arguments for C++ functions called from Python
-template <typename... Args>
-class argument_loader {
-    using indices = make_index_sequence<sizeof...(Args)>;
+template <typename... Args> class argument_loader {
+  using indices = make_index_sequence<sizeof...(Args)>;
 
-    template <typename Arg> using argument_is_args   = std::is_same<intrinsic_t<Arg>, args>;
-    template <typename Arg> using argument_is_kwargs = std::is_same<intrinsic_t<Arg>, kwargs>;
-    // Get args/kwargs argument positions relative to the end of the argument list:
-    static constexpr auto args_pos = constexpr_first<argument_is_args, Args...>() - (int) sizeof...(Args),
-                        kwargs_pos = constexpr_first<argument_is_kwargs, Args...>() - (int) sizeof...(Args);
+  template <typename Arg>
+  using argument_is_args = std::is_same<intrinsic_t<Arg>, args>;
+  template <typename Arg>
+  using argument_is_kwargs = std::is_same<intrinsic_t<Arg>, kwargs>;
+  // Get args/kwargs argument positions relative to the end of the argument
+  // list:
+  static constexpr auto args_pos =
+                            constexpr_first<argument_is_args, Args...>() -
+                            (int)sizeof...(Args),
+                        kwargs_pos =
+                            constexpr_first<argument_is_kwargs, Args...>() -
+                            (int)sizeof...(Args);
 
-    static constexpr bool args_kwargs_are_last = kwargs_pos >= - 1 && args_pos >= kwargs_pos - 1;
+  static constexpr bool args_kwargs_are_last =
+      kwargs_pos >= -1 && args_pos >= kwargs_pos - 1;
 
-    static_assert(args_kwargs_are_last, "py::args/py::kwargs are only permitted as the last argument(s) of a function");
+  static_assert(args_kwargs_are_last, "py::args/py::kwargs are only permitted "
+                                      "as the last argument(s) of a function");
 
 public:
-    static constexpr bool has_kwargs = kwargs_pos < 0;
-    static constexpr bool has_args = args_pos < 0;
+  static constexpr bool has_kwargs = kwargs_pos < 0;
+  static constexpr bool has_args = args_pos < 0;
 
-    static constexpr auto arg_names = concat(type_descr(make_caster<Args>::name)...);
+  static constexpr auto arg_names =
+      concat(type_descr(make_caster<Args>::name)...);
 
-    bool load_args(function_call &call) {
-        return load_impl_sequence(call, indices{});
-    }
+  bool load_args(function_call &call) {
+    return load_impl_sequence(call, indices{});
+  }
 
-    template <typename Return, typename Guard, typename Func>
-    enable_if_t<!std::is_void<Return>::value, Return> call(Func &&f) && {
-        return std::move(*this).template call_impl<Return>(std::forward<Func>(f), indices{}, Guard{});
-    }
+  template <typename Return, typename Guard, typename Func>
+  enable_if_t<!std::is_void<Return>::value, Return> call(Func &&f) && {
+    return std::move(*this).template call_impl<Return>(std::forward<Func>(f),
+                                                       indices{}, Guard{});
+  }
 
-    template <typename Return, typename Guard, typename Func>
-    enable_if_t<std::is_void<Return>::value, void_type> call(Func &&f) && {
-        std::move(*this).template call_impl<Return>(std::forward<Func>(f), indices{}, Guard{});
-        return void_type();
-    }
+  template <typename Return, typename Guard, typename Func>
+  enable_if_t<std::is_void<Return>::value, void_type> call(Func &&f) && {
+    std::move(*this).template call_impl<Return>(std::forward<Func>(f),
+                                                indices{}, Guard{});
+    return void_type();
+  }
 
 private:
+  static bool load_impl_sequence(function_call &, index_sequence<>) {
+    return true;
+  }
 
-    static bool load_impl_sequence(function_call &, index_sequence<>) { return true; }
+  template <size_t... Is>
+  bool load_impl_sequence(function_call &call, index_sequence<Is...>) {
+    for (bool r : {std::get<Is>(argcasters)
+                       .load(call.args[Is], call.args_convert[Is])...})
+      if (!r)
+        return false;
+    return true;
+  }
 
-    template <size_t... Is>
-    bool load_impl_sequence(function_call &call, index_sequence<Is...>) {
-        for (bool r : {std::get<Is>(argcasters).load(call.args[Is], call.args_convert[Is])...})
-            if (!r)
-                return false;
-        return true;
-    }
+  template <typename Return, typename Func, size_t... Is, typename Guard>
+  Return call_impl(Func &&f, index_sequence<Is...>, Guard &&) {
+    return std::forward<Func>(f)(
+        cast_op<Args>(std::move(std::get<Is>(argcasters)))...);
+  }
 
-    template <typename Return, typename Func, size_t... Is, typename Guard>
-    Return call_impl(Func &&f, index_sequence<Is...>, Guard &&) {
-        return std::forward<Func>(f)(cast_op<Args>(std::move(std::get<Is>(argcasters)))...);
-    }
-
-    std::tuple<make_caster<Args>...> argcasters;
+  std::tuple<make_caster<Args>...> argcasters;
 };
 
-/// Helper class which collects only positional arguments for a Python function call.
-/// A fancier version below can collect any argument, but this one is optimal for simple calls.
-template <return_value_policy policy>
-class simple_collector {
+/// Helper class which collects only positional arguments for a Python function
+/// call. A fancier version below can collect any argument, but this one is
+/// optimal for simple calls.
+template <return_value_policy policy> class simple_collector {
 public:
-    template <typename... Ts>
-    explicit simple_collector(Ts &&...values)
-        : m_args(pybind11::make_tuple<policy>(std::forward<Ts>(values)...)) { }
+  template <typename... Ts>
+  explicit simple_collector(Ts &&... values)
+      : m_args(pybind11::make_tuple<policy>(std::forward<Ts>(values)...)) {}
 
-    const tuple &args() const & { return m_args; }
-    dict kwargs() const { return {}; }
+  const tuple &args() const & { return m_args; }
+  dict kwargs() const { return {}; }
 
-    tuple args() && { return std::move(m_args); }
+  tuple args() && { return std::move(m_args); }
 
-    /// Call a Python function and pass the collected arguments
-    object call(PyObject *ptr) const {
-        PyObject *result = PyObject_CallObject(ptr, m_args.ptr());
-        if (!result)
-            throw error_already_set();
-        return reinterpret_steal<object>(result);
-    }
+  /// Call a Python function and pass the collected arguments
+  object call(PyObject *ptr) const {
+    PyObject *result = PyObject_CallObject(ptr, m_args.ptr());
+    if (!result)
+      throw error_already_set();
+    return reinterpret_steal<object>(result);
+  }
 
 private:
-    tuple m_args;
+  tuple m_args;
 };
 
-/// Helper class which collects positional, keyword, * and ** arguments for a Python function call
-template <return_value_policy policy>
-class unpacking_collector {
+/// Helper class which collects positional, keyword, * and ** arguments for a
+/// Python function call
+template <return_value_policy policy> class unpacking_collector {
 public:
-    template <typename... Ts>
-    explicit unpacking_collector(Ts &&...values) {
-        // Tuples aren't (easily) resizable so a list is needed for collection,
-        // but the actual function call strictly requires a tuple.
-        auto args_list = list();
-        int _[] = { 0, (process(args_list, std::forward<Ts>(values)), 0)... };
-        ignore_unused(_);
+  template <typename... Ts> explicit unpacking_collector(Ts &&... values) {
+    // Tuples aren't (easily) resizable so a list is needed for collection,
+    // but the actual function call strictly requires a tuple.
+    auto args_list = list();
+    int _[] = {0, (process(args_list, std::forward<Ts>(values)), 0)...};
+    ignore_unused(_);
 
-        m_args = std::move(args_list);
-    }
+    m_args = std::move(args_list);
+  }
 
-    const tuple &args() const & { return m_args; }
-    const dict &kwargs() const & { return m_kwargs; }
+  const tuple &args() const & { return m_args; }
+  const dict &kwargs() const & { return m_kwargs; }
 
-    tuple args() && { return std::move(m_args); }
-    dict kwargs() && { return std::move(m_kwargs); }
+  tuple args() && { return std::move(m_args); }
+  dict kwargs() && { return std::move(m_kwargs); }
 
-    /// Call a Python function and pass the collected arguments
-    object call(PyObject *ptr) const {
-        PyObject *result = PyObject_Call(ptr, m_args.ptr(), m_kwargs.ptr());
-        if (!result)
-            throw error_already_set();
-        return reinterpret_steal<object>(result);
-    }
+  /// Call a Python function and pass the collected arguments
+  object call(PyObject *ptr) const {
+    PyObject *result = PyObject_Call(ptr, m_args.ptr(), m_kwargs.ptr());
+    if (!result)
+      throw error_already_set();
+    return reinterpret_steal<object>(result);
+  }
 
 private:
-    template <typename T>
-    void process(list &args_list, T &&x) {
-        auto o = reinterpret_steal<object>(detail::make_caster<T>::cast(std::forward<T>(x), policy, {}));
-        if (!o) {
+  template <typename T> void process(list &args_list, T &&x) {
+    auto o = reinterpret_steal<object>(
+        detail::make_caster<T>::cast(std::forward<T>(x), policy, {}));
+    if (!o) {
 #if defined(NDEBUG)
-            argument_cast_error();
+      argument_cast_error();
 #else
-            argument_cast_error(std::to_string(args_list.size()), type_id<T>());
+      argument_cast_error(std::to_string(args_list.size()), type_id<T>());
 #endif
-        }
-        args_list.append(o);
     }
+    args_list.append(o);
+  }
 
-    void process(list &args_list, detail::args_proxy ap) {
-        for (const auto &a : ap)
-            args_list.append(a);
-    }
+  void process(list &args_list, detail::args_proxy ap) {
+    for (const auto &a : ap)
+      args_list.append(a);
+  }
 
-    void process(list &/*args_list*/, arg_v a) {
-        if (!a.name)
+  void process(list & /*args_list*/, arg_v a) {
+    if (!a.name)
 #if defined(NDEBUG)
-            nameless_argument_error();
+      nameless_argument_error();
 #else
-            nameless_argument_error(a.type);
+      nameless_argument_error(a.type);
 #endif
 
-        if (m_kwargs.contains(a.name)) {
+    if (m_kwargs.contains(a.name)) {
 #if defined(NDEBUG)
-            multiple_values_error();
+      multiple_values_error();
 #else
-            multiple_values_error(a.name);
+      multiple_values_error(a.name);
 #endif
-        }
-        if (!a.value) {
+    }
+    if (!a.value) {
 #if defined(NDEBUG)
-            argument_cast_error();
+      argument_cast_error();
 #else
-            argument_cast_error(a.name, a.type);
+      argument_cast_error(a.name, a.type);
 #endif
-        }
-        m_kwargs[a.name] = a.value;
     }
+    m_kwargs[a.name] = a.value;
+  }
 
-    void process(list &/*args_list*/, detail::kwargs_proxy kp) {
-        if (!kp)
-            return;
-        for (const auto &k : reinterpret_borrow<dict>(kp)) {
-            if (m_kwargs.contains(k.first)) {
+  void process(list & /*args_list*/, detail::kwargs_proxy kp) {
+    if (!kp)
+      return;
+    for (const auto &k : reinterpret_borrow<dict>(kp)) {
+      if (m_kwargs.contains(k.first)) {
 #if defined(NDEBUG)
-                multiple_values_error();
+        multiple_values_error();
 #else
-                multiple_values_error(str(k.first));
+        multiple_values_error(str(k.first));
 #endif
-            }
-            m_kwargs[k.first] = k.second;
-        }
+      }
+      m_kwargs[k.first] = k.second;
     }
+  }
 
-    [[noreturn]] static void nameless_argument_error() {
-        throw type_error("Got kwargs without a name; only named arguments "
-                         "may be passed via py::arg() to a python function call. "
-                         "(compile in debug mode for details)");
-    }
-    [[noreturn]] static void nameless_argument_error(std::string type) {
-        throw type_error("Got kwargs without a name of type '" + type + "'; only named "
-                         "arguments may be passed via py::arg() to a python function call. ");
-    }
-    [[noreturn]] static void multiple_values_error() {
-        throw type_error("Got multiple values for keyword argument "
-                         "(compile in debug mode for details)");
-    }
+  [[noreturn]] static void nameless_argument_error() {
+    throw type_error("Got kwargs without a name; only named arguments "
+                     "may be passed via py::arg() to a python function call. "
+                     "(compile in debug mode for details)");
+  }
+  [[noreturn]] static void nameless_argument_error(std::string type) {
+    throw type_error(
+        "Got kwargs without a name of type '" + type +
+        "'; only named "
+        "arguments may be passed via py::arg() to a python function call. ");
+  }
+  [[noreturn]] static void multiple_values_error() {
+    throw type_error("Got multiple values for keyword argument "
+                     "(compile in debug mode for details)");
+  }
 
-    [[noreturn]] static void multiple_values_error(std::string name) {
-        throw type_error("Got multiple values for keyword argument '" + name + "'");
-    }
+  [[noreturn]] static void multiple_values_error(std::string name) {
+    throw type_error("Got multiple values for keyword argument '" + name + "'");
+  }
 
-    [[noreturn]] static void argument_cast_error() {
-        throw cast_error("Unable to convert call argument to Python object "
-                         "(compile in debug mode for details)");
-    }
+  [[noreturn]] static void argument_cast_error() {
+    throw cast_error("Unable to convert call argument to Python object "
+                     "(compile in debug mode for details)");
+  }
 
-    [[noreturn]] static void argument_cast_error(std::string name, std::string type) {
-        throw cast_error("Unable to convert call argument '" + name
-                         + "' of type '" + type + "' to Python object");
-    }
+  [[noreturn]] static void argument_cast_error(std::string name,
+                                               std::string type) {
+    throw cast_error("Unable to convert call argument '" + name +
+                     "' of type '" + type + "' to Python object");
+  }
 
 private:
-    tuple m_args;
-    dict m_kwargs;
+  tuple m_args;
+  dict m_kwargs;
 };
 
 /// Collect only positional arguments for a Python function call
 template <return_value_policy policy, typename... Args,
           typename = enable_if_t<all_of<is_positional<Args>...>::value>>
-simple_collector<policy> collect_arguments(Args &&...args) {
-    return simple_collector<policy>(std::forward<Args>(args)...);
+simple_collector<policy> collect_arguments(Args &&... args) {
+  return simple_collector<policy>(std::forward<Args>(args)...);
 }
 
-/// Collect all arguments, including keywords and unpacking (only instantiated when needed)
+/// Collect all arguments, including keywords and unpacking (only instantiated
+/// when needed)
 template <return_value_policy policy, typename... Args,
           typename = enable_if_t<!all_of<is_positional<Args>...>::value>>
-unpacking_collector<policy> collect_arguments(Args &&...args) {
-    // Following argument order rules for generalized unpacking according to PEP 448
-    static_assert(
-        constexpr_last<is_positional, Args...>() < constexpr_first<is_keyword_or_ds, Args...>()
-        && constexpr_last<is_s_unpacking, Args...>() < constexpr_first<is_ds_unpacking, Args...>(),
-        "Invalid function call: positional args must precede keywords and ** unpacking; "
-        "* unpacking must precede ** unpacking"
-    );
-    return unpacking_collector<policy>(std::forward<Args>(args)...);
+unpacking_collector<policy> collect_arguments(Args &&... args) {
+  // Following argument order rules for generalized unpacking according to PEP
+  // 448
+  static_assert(constexpr_last<is_positional, Args...>() <
+                        constexpr_first<is_keyword_or_ds, Args...>() &&
+                    constexpr_last<is_s_unpacking, Args...>() <
+                        constexpr_first<is_ds_unpacking, Args...>(),
+                "Invalid function call: positional args must precede keywords "
+                "and ** unpacking; "
+                "* unpacking must precede ** unpacking");
+  return unpacking_collector<policy>(std::forward<Args>(args)...);
 }
 
 template <typename Derived>
 template <return_value_policy policy, typename... Args>
-object object_api<Derived>::operator()(Args &&...args) const {
-    return detail::collect_arguments<policy>(std::forward<Args>(args)...).call(derived().ptr());
+object object_api<Derived>::operator()(Args &&... args) const {
+  return detail::collect_arguments<policy>(std::forward<Args>(args)...)
+      .call(derived().ptr());
 }
 
 template <typename Derived>
 template <return_value_policy policy, typename... Args>
-object object_api<Derived>::call(Args &&...args) const {
-    return operator()<policy>(std::forward<Args>(args)...);
+object object_api<Derived>::call(Args &&... args) const {
+  return operator()<policy>(std::forward<Args>(args)...);
 }
 
 NAMESPACE_END(detail)
 
-#define PYBIND11_MAKE_OPAQUE(...) \
-    namespace pybind11 { namespace detail { \
-        template<> class type_caster<__VA_ARGS__> : public type_caster_base<__VA_ARGS__> { }; \
-    }}
+#define PYBIND11_MAKE_OPAQUE(...)                                              \
+  namespace pybind11 {                                                         \
+  namespace detail {                                                           \
+  template <>                                                                  \
+  class type_caster<__VA_ARGS__> : public type_caster_base<__VA_ARGS__> {};    \
+  }                                                                            \
+  }
 
-/// Lets you pass a type containing a `,` through a macro parameter without needing a separate
-/// typedef, e.g.: `PYBIND11_OVERLOAD(PYBIND11_TYPE(ReturnType<A, B>), PYBIND11_TYPE(Parent<C, D>), f, arg)`
+/// Lets you pass a type containing a `,` through a macro parameter without
+/// needing a separate typedef, e.g.:
+/// `PYBIND11_OVERLOAD(PYBIND11_TYPE(ReturnType<A, B>),
+/// PYBIND11_TYPE(Parent<C, D>), f, arg)`
 #define PYBIND11_TYPE(...) __VA_ARGS__
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/chrono.h b/python/src/pybind11/chrono.h
index 95ada76e0..974355152 100644
--- a/python/src/pybind11/chrono.h
+++ b/python/src/pybind11/chrono.h
@@ -1,5 +1,6 @@
 /*
-    pybind11/chrono.h: Transparent conversion between std::chrono and python's datetime
+    pybind11/chrono.h: Transparent conversion between std::chrono and python's
+   datetime
 
     Copyright (c) 2016 Trent Houliston <trent@houliston.me> and
                        Wenzel Jakob <wenzel.jakob@epfl.ch>
@@ -11,20 +12,21 @@
 #pragma once
 
 #include "pybind11.h"
+#include <chrono>
 #include <cmath>
 #include <ctime>
-#include <chrono>
 #include <datetime.h>
 
 // Backport the PyDateTime_DELTA functions from Python3.3 if required
 #ifndef PyDateTime_DELTA_GET_DAYS
-#define PyDateTime_DELTA_GET_DAYS(o)         (((PyDateTime_Delta*)o)->days)
+#define PyDateTime_DELTA_GET_DAYS(o) (((PyDateTime_Delta *)o)->days)
 #endif
 #ifndef PyDateTime_DELTA_GET_SECONDS
-#define PyDateTime_DELTA_GET_SECONDS(o)      (((PyDateTime_Delta*)o)->seconds)
+#define PyDateTime_DELTA_GET_SECONDS(o) (((PyDateTime_Delta *)o)->seconds)
 #endif
 #ifndef PyDateTime_DELTA_GET_MICROSECONDS
-#define PyDateTime_DELTA_GET_MICROSECONDS(o) (((PyDateTime_Delta*)o)->microseconds)
+#define PyDateTime_DELTA_GET_MICROSECONDS(o)                                   \
+  (((PyDateTime_Delta *)o)->microseconds)
 #endif
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
@@ -32,131 +34,154 @@ NAMESPACE_BEGIN(detail)
 
 template <typename type> class duration_caster {
 public:
-    typedef typename type::rep rep;
-    typedef typename type::period period;
+  typedef typename type::rep rep;
+  typedef typename type::period period;
 
-    typedef std::chrono::duration<uint_fast32_t, std::ratio<86400>> days;
+  typedef std::chrono::duration<uint_fast32_t, std::ratio<86400>> days;
 
-    bool load(handle src, bool) {
-        using namespace std::chrono;
+  bool load(handle src, bool) {
+    using namespace std::chrono;
 
-        // Lazy initialise the PyDateTime import
-        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
-
-        if (!src) return false;
-        // If invoked with datetime.delta object
-        if (PyDelta_Check(src.ptr())) {
-            value = type(duration_cast<duration<rep, period>>(
-                  days(PyDateTime_DELTA_GET_DAYS(src.ptr()))
-                + seconds(PyDateTime_DELTA_GET_SECONDS(src.ptr()))
-                + microseconds(PyDateTime_DELTA_GET_MICROSECONDS(src.ptr()))));
-            return true;
-        }
-        // If invoked with a float we assume it is seconds and convert
-        else if (PyFloat_Check(src.ptr())) {
-            value = type(duration_cast<duration<rep, period>>(duration<double>(PyFloat_AsDouble(src.ptr()))));
-            return true;
-        }
-        else return false;
+    // Lazy initialise the PyDateTime import
+    if (!PyDateTimeAPI) {
+      PyDateTime_IMPORT;
     }
 
-    // If this is a duration just return it back
-    static const std::chrono::duration<rep, period>& get_duration(const std::chrono::duration<rep, period> &src) {
-        return src;
+    if (!src)
+      return false;
+    // If invoked with a datetime.timedelta object
+    if (PyDelta_Check(src.ptr())) {
+      value = type(duration_cast<duration<rep, period>>(
+          days(PyDateTime_DELTA_GET_DAYS(src.ptr())) +
+          seconds(PyDateTime_DELTA_GET_SECONDS(src.ptr())) +
+          microseconds(PyDateTime_DELTA_GET_MICROSECONDS(src.ptr()))));
+      return true;
+    }
+    // If invoked with a float we assume it is seconds and convert
+    else if (PyFloat_Check(src.ptr())) {
+      value = type(duration_cast<duration<rep, period>>(
+          duration<double>(PyFloat_AsDouble(src.ptr()))));
+      return true;
+    } else
+      return false;
+  }
+
+  // If this is a duration just return it back
+  static const std::chrono::duration<rep, period> &
+  get_duration(const std::chrono::duration<rep, period> &src) {
+    return src;
+  }
+
+  // If this is a time_point get the time_since_epoch
+  template <typename Clock>
+  static std::chrono::duration<rep, period> get_duration(
+      const std::chrono::time_point<Clock, std::chrono::duration<rep, period>>
+          &src) {
+    return src.time_since_epoch();
+  }
+
+  static handle cast(const type &src, return_value_policy /* policy */,
+                     handle /* parent */) {
+    using namespace std::chrono;
+
+    // Use the overloaded get_duration to extract a duration from the source;
+    // it handles both a plain duration and a time_point.
+    auto d = get_duration(src);
+
+    // Lazy initialise the PyDateTime import
+    if (!PyDateTimeAPI) {
+      PyDateTime_IMPORT;
     }
 
-    // If this is a time_point get the time_since_epoch
-    template <typename Clock> static std::chrono::duration<rep, period> get_duration(const std::chrono::time_point<Clock, std::chrono::duration<rep, period>> &src) {
-        return src.time_since_epoch();
-    }
+    // Declare these special duration types so the conversions happen with the
+    // correct primitive types (int)
+    using dd_t = duration<int, std::ratio<86400>>;
+    using ss_t = duration<int, std::ratio<1>>;
+    using us_t = duration<int, std::micro>;
 
-    static handle cast(const type &src, return_value_policy /* policy */, handle /* parent */) {
-        using namespace std::chrono;
+    auto dd = duration_cast<dd_t>(d);
+    auto subd = d - dd;
+    auto ss = duration_cast<ss_t>(subd);
+    auto us = duration_cast<us_t>(subd - ss);
+    return PyDelta_FromDSU(dd.count(), ss.count(), us.count());
+  }
 
-        // Use overloaded function to get our duration from our source
-        // Works out if it is a duration or time_point and get the duration
-        auto d = get_duration(src);
-
-        // Lazy initialise the PyDateTime import
-        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
-
-        // Declare these special duration types so the conversions happen with the correct primitive types (int)
-        using dd_t = duration<int, std::ratio<86400>>;
-        using ss_t = duration<int, std::ratio<1>>;
-        using us_t = duration<int, std::micro>;
-
-        auto dd = duration_cast<dd_t>(d);
-        auto subd = d - dd;
-        auto ss = duration_cast<ss_t>(subd);
-        auto us = duration_cast<us_t>(subd - ss);
-        return PyDelta_FromDSU(dd.count(), ss.count(), us.count());
-    }
-
-    PYBIND11_TYPE_CASTER(type, _("datetime.timedelta"));
+  PYBIND11_TYPE_CASTER(type, _("datetime.timedelta"));
 };
 
-// This is for casting times on the system clock into datetime.datetime instances
-template <typename Duration> class type_caster<std::chrono::time_point<std::chrono::system_clock, Duration>> {
+// This is for casting times on the system clock into datetime.datetime
+// instances
+template <typename Duration>
+class type_caster<
+    std::chrono::time_point<std::chrono::system_clock, Duration>> {
 public:
-    typedef std::chrono::time_point<std::chrono::system_clock, Duration> type;
-    bool load(handle src, bool) {
-        using namespace std::chrono;
+  typedef std::chrono::time_point<std::chrono::system_clock, Duration> type;
+  bool load(handle src, bool) {
+    using namespace std::chrono;
 
-        // Lazy initialise the PyDateTime import
-        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
-
-        if (!src) return false;
-        if (PyDateTime_Check(src.ptr())) {
-            std::tm cal;
-            cal.tm_sec   = PyDateTime_DATE_GET_SECOND(src.ptr());
-            cal.tm_min   = PyDateTime_DATE_GET_MINUTE(src.ptr());
-            cal.tm_hour  = PyDateTime_DATE_GET_HOUR(src.ptr());
-            cal.tm_mday  = PyDateTime_GET_DAY(src.ptr());
-            cal.tm_mon   = PyDateTime_GET_MONTH(src.ptr()) - 1;
-            cal.tm_year  = PyDateTime_GET_YEAR(src.ptr()) - 1900;
-            cal.tm_isdst = -1;
-
-            value = system_clock::from_time_t(std::mktime(&cal)) + microseconds(PyDateTime_DATE_GET_MICROSECOND(src.ptr()));
-            return true;
-        }
-        else return false;
+    // Lazy initialise the PyDateTime import
+    if (!PyDateTimeAPI) {
+      PyDateTime_IMPORT;
     }
 
-    static handle cast(const std::chrono::time_point<std::chrono::system_clock, Duration> &src, return_value_policy /* policy */, handle /* parent */) {
-        using namespace std::chrono;
+    if (!src)
+      return false;
+    if (PyDateTime_Check(src.ptr())) {
+      std::tm cal;
+      cal.tm_sec = PyDateTime_DATE_GET_SECOND(src.ptr());
+      cal.tm_min = PyDateTime_DATE_GET_MINUTE(src.ptr());
+      cal.tm_hour = PyDateTime_DATE_GET_HOUR(src.ptr());
+      cal.tm_mday = PyDateTime_GET_DAY(src.ptr());
+      cal.tm_mon = PyDateTime_GET_MONTH(src.ptr()) - 1;
+      cal.tm_year = PyDateTime_GET_YEAR(src.ptr()) - 1900;
+      cal.tm_isdst = -1;
 
-        // Lazy initialise the PyDateTime import
-        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
+      value = system_clock::from_time_t(std::mktime(&cal)) +
+              microseconds(PyDateTime_DATE_GET_MICROSECOND(src.ptr()));
+      return true;
+    } else
+      return false;
+  }
 
-        std::time_t tt = system_clock::to_time_t(src);
-        // this function uses static memory so it's best to copy it out asap just in case
-        // otherwise other code that is using localtime may break this (not just python code)
-        std::tm localtime = *std::localtime(&tt);
+  static handle
+  cast(const std::chrono::time_point<std::chrono::system_clock, Duration> &src,
+       return_value_policy /* policy */, handle /* parent */) {
+    using namespace std::chrono;
 
-        // Declare these special duration types so the conversions happen with the correct primitive types (int)
-        using us_t = duration<int, std::micro>;
-
-        return PyDateTime_FromDateAndTime(localtime.tm_year + 1900,
-                                          localtime.tm_mon + 1,
-                                          localtime.tm_mday,
-                                          localtime.tm_hour,
-                                          localtime.tm_min,
-                                          localtime.tm_sec,
-                                          (duration_cast<us_t>(src.time_since_epoch() % seconds(1))).count());
+    // Lazy initialise the PyDateTime import
+    if (!PyDateTimeAPI) {
+      PyDateTime_IMPORT;
     }
-    PYBIND11_TYPE_CASTER(type, _("datetime.datetime"));
+
+    std::time_t tt = system_clock::to_time_t(src);
+    // std::localtime returns a pointer to static storage, so copy the result
+    // out immediately; otherwise other code calling localtime (not just
+    // Python code) could overwrite it before we read it.
+    std::tm localtime = *std::localtime(&tt);
+
+    // Declare these special duration types so the conversions happen with the
+    // correct primitive types (int)
+    using us_t = duration<int, std::micro>;
+
+    return PyDateTime_FromDateAndTime(
+        localtime.tm_year + 1900, localtime.tm_mon + 1, localtime.tm_mday,
+        localtime.tm_hour, localtime.tm_min, localtime.tm_sec,
+        (duration_cast<us_t>(src.time_since_epoch() % seconds(1))).count());
+  }
+  PYBIND11_TYPE_CASTER(type, _("datetime.datetime"));
 };
 
-// Other clocks that are not the system clock are not measured as datetime.datetime objects
-// since they are not measured on calendar time. So instead we just make them timedeltas
-// Or if they have passed us a time as a float we convert that
-template <typename Clock, typename Duration> class type_caster<std::chrono::time_point<Clock, Duration>>
-: public duration_caster<std::chrono::time_point<Clock, Duration>> {
-};
+// Clocks other than the system clock are not converted to datetime.datetime
+// objects, since they are not measured on calendar time. Instead, we convert
+// them to timedeltas; if we are passed a time as a float, we convert that
+// (interpreted as seconds).
+template <typename Clock, typename Duration>
+class type_caster<std::chrono::time_point<Clock, Duration>>
+    : public duration_caster<std::chrono::time_point<Clock, Duration>> {};
 
-template <typename Rep, typename Period> class type_caster<std::chrono::duration<Rep, Period>>
-: public duration_caster<std::chrono::duration<Rep, Period>> {
-};
+template <typename Rep, typename Period>
+class type_caster<std::chrono::duration<Rep, Period>>
+    : public duration_caster<std::chrono::duration<Rep, Period>> {};
 
 NAMESPACE_END(detail)
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/common.h b/python/src/pybind11/common.h
index 6c8a4f1e8..bc10a1675 100644
--- a/python/src/pybind11/common.h
+++ b/python/src/pybind11/common.h
@@ -1,2 +1,3 @@
 #include "detail/common.h"
-#warning "Including 'common.h' is deprecated. It will be removed in v3.0. Use 'pybind11.h'."
+#warning                                                                       \
+    "Including 'common.h' is deprecated. It will be removed in v3.0. Use 'pybind11.h'."
diff --git a/python/src/pybind11/complex.h b/python/src/pybind11/complex.h
index 3f8963857..e4b15624f 100644
--- a/python/src/pybind11/complex.h
+++ b/python/src/pybind11/complex.h
@@ -14,52 +14,59 @@
 
 /// glibc defines I as a macro which breaks things, e.g., boost template names
 #ifdef I
-#  undef I
+#undef I
 #endif
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
-template <typename T> struct format_descriptor<std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
-    static constexpr const char c = format_descriptor<T>::c;
-    static constexpr const char value[3] = { 'Z', c, '\0' };
-    static std::string format() { return std::string(value); }
+template <typename T>
+struct format_descriptor<
+    std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
+  static constexpr const char c = format_descriptor<T>::c;
+  static constexpr const char value[3] = {'Z', c, '\0'};
+  static std::string format() { return std::string(value); }
 };
 
 #ifndef PYBIND11_CPP17
 
-template <typename T> constexpr const char format_descriptor<
-    std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>>::value[3];
+template <typename T>
+constexpr const char format_descriptor<
+    std::complex<T>,
+    detail::enable_if_t<std::is_floating_point<T>::value>>::value[3];
 
 #endif
 
 NAMESPACE_BEGIN(detail)
 
-template <typename T> struct is_fmt_numeric<std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
-    static constexpr bool value = true;
-    static constexpr int index = is_fmt_numeric<T>::index + 3;
+template <typename T>
+struct is_fmt_numeric<std::complex<T>,
+                      detail::enable_if_t<std::is_floating_point<T>::value>> {
+  static constexpr bool value = true;
+  static constexpr int index = is_fmt_numeric<T>::index + 3;
 };
 
 template <typename T> class type_caster<std::complex<T>> {
 public:
-    bool load(handle src, bool convert) {
-        if (!src)
-            return false;
-        if (!convert && !PyComplex_Check(src.ptr()))
-            return false;
-        Py_complex result = PyComplex_AsCComplex(src.ptr());
-        if (result.real == -1.0 && PyErr_Occurred()) {
-            PyErr_Clear();
-            return false;
-        }
-        value = std::complex<T>((T) result.real, (T) result.imag);
-        return true;
+  bool load(handle src, bool convert) {
+    if (!src)
+      return false;
+    if (!convert && !PyComplex_Check(src.ptr()))
+      return false;
+    Py_complex result = PyComplex_AsCComplex(src.ptr());
+    if (result.real == -1.0 && PyErr_Occurred()) {
+      PyErr_Clear();
+      return false;
     }
+    value = std::complex<T>((T)result.real, (T)result.imag);
+    return true;
+  }
 
-    static handle cast(const std::complex<T> &src, return_value_policy /* policy */, handle /* parent */) {
-        return PyComplex_FromDoubles((double) src.real(), (double) src.imag());
-    }
+  static handle cast(const std::complex<T> &src,
+                     return_value_policy /* policy */, handle /* parent */) {
+    return PyComplex_FromDoubles((double)src.real(), (double)src.imag());
+  }
 
-    PYBIND11_TYPE_CASTER(std::complex<T>, _("complex"));
+  PYBIND11_TYPE_CASTER(std::complex<T>, _("complex"));
 };
 NAMESPACE_END(detail)
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/detail/class.h b/python/src/pybind11/detail/class.h
index b1916fcd0..0ee81aefa 100644
--- a/python/src/pybind11/detail/class.h
+++ b/python/src/pybind11/detail/class.h
@@ -16,76 +16,81 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
 #if PY_VERSION_HEX >= 0x03030000
-#  define PYBIND11_BUILTIN_QUALNAME
-#  define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj)
+#define PYBIND11_BUILTIN_QUALNAME
+#define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj)
 #else
-// In pre-3.3 Python, we still set __qualname__ so that we can produce reliable function type
-// signatures; in 3.3+ this macro expands to nothing:
-#  define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj) setattr((PyObject *) obj, "__qualname__", nameobj)
+// In pre-3.3 Python, we still set __qualname__ so that we can produce reliable
+// function type signatures; in 3.3+ this macro expands to nothing:
+#define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj)                              \
+  setattr((PyObject *)obj, "__qualname__", nameobj)
 #endif
 
 inline PyTypeObject *type_incref(PyTypeObject *type) {
-    Py_INCREF(type);
-    return type;
+  Py_INCREF(type);
+  return type;
 }
 
 #if !defined(PYPY_VERSION)
 
-/// `pybind11_static_property.__get__()`: Always pass the class instead of the instance.
-extern "C" inline PyObject *pybind11_static_get(PyObject *self, PyObject * /*ob*/, PyObject *cls) {
-    return PyProperty_Type.tp_descr_get(self, cls, cls);
+/// `pybind11_static_property.__get__()`: Always pass the class instead of the
+/// instance.
+extern "C" inline PyObject *
+pybind11_static_get(PyObject *self, PyObject * /*ob*/, PyObject *cls) {
+  return PyProperty_Type.tp_descr_get(self, cls, cls);
 }
 
 /// `pybind11_static_property.__set__()`: Just like the above `__get__()`.
-extern "C" inline int pybind11_static_set(PyObject *self, PyObject *obj, PyObject *value) {
-    PyObject *cls = PyType_Check(obj) ? obj : (PyObject *) Py_TYPE(obj);
-    return PyProperty_Type.tp_descr_set(self, cls, value);
+extern "C" inline int pybind11_static_set(PyObject *self, PyObject *obj,
+                                          PyObject *value) {
+  PyObject *cls = PyType_Check(obj) ? obj : (PyObject *)Py_TYPE(obj);
+  return PyProperty_Type.tp_descr_set(self, cls, value);
 }
 
-/** A `static_property` is the same as a `property` but the `__get__()` and `__set__()`
-    methods are modified to always use the object type instead of a concrete instance.
-    Return value: New reference. */
+/** A `static_property` is the same as a `property` but the `__get__()` and
+   `__set__()` methods are modified to always use the object type instead of a
+   concrete instance. Return value: New reference. */
 inline PyTypeObject *make_static_property_type() {
-    constexpr auto *name = "pybind11_static_property";
-    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+  constexpr auto *name = "pybind11_static_property";
+  auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
 
-    /* Danger zone: from now (and until PyType_Ready), make sure to
-       issue no Python C API calls which could potentially invoke the
-       garbage collector (the GC will call type_traverse(), which will in
-       turn find the newly constructed type in an invalid state) */
-    auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0);
-    if (!heap_type)
-        pybind11_fail("make_static_property_type(): error allocating type!");
+  /* Danger zone: from now (and until PyType_Ready), make sure to
+     issue no Python C API calls which could potentially invoke the
+     garbage collector (the GC will call type_traverse(), which will in
+     turn find the newly constructed type in an invalid state) */
+  auto heap_type = (PyHeapTypeObject *)PyType_Type.tp_alloc(&PyType_Type, 0);
+  if (!heap_type)
+    pybind11_fail("make_static_property_type(): error allocating type!");
 
-    heap_type->ht_name = name_obj.inc_ref().ptr();
+  heap_type->ht_name = name_obj.inc_ref().ptr();
 #ifdef PYBIND11_BUILTIN_QUALNAME
-    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+  heap_type->ht_qualname = name_obj.inc_ref().ptr();
 #endif
 
-    auto type = &heap_type->ht_type;
-    type->tp_name = name;
-    type->tp_base = type_incref(&PyProperty_Type);
-    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
-    type->tp_descr_get = pybind11_static_get;
-    type->tp_descr_set = pybind11_static_set;
+  auto type = &heap_type->ht_type;
+  type->tp_name = name;
+  type->tp_base = type_incref(&PyProperty_Type);
+  type->tp_flags =
+      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+  type->tp_descr_get = pybind11_static_get;
+  type->tp_descr_set = pybind11_static_set;
 
-    if (PyType_Ready(type) < 0)
-        pybind11_fail("make_static_property_type(): failure in PyType_Ready()!");
+  if (PyType_Ready(type) < 0)
+    pybind11_fail("make_static_property_type(): failure in PyType_Ready()!");
 
-    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
-    PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
+  setattr((PyObject *)type, "__module__", str("pybind11_builtins"));
+  PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
 
-    return type;
+  return type;
 }
 
 #else // PYPY
 
-/** PyPy has some issues with the above C API, so we evaluate Python code instead.
-    This function will only be called once so performance isn't really a concern.
-    Return value: New reference. */
+/** PyPy has some issues with the above C API, so we evaluate Python code
+   instead. This function will only be called once so performance isn't really a
+   concern. Return value: New reference. */
 inline PyTypeObject *make_static_property_type() {
-    auto d = dict();
-    PyObject *result = PyRun_String(R"(\
+  auto d = dict();
+  PyObject *result = PyRun_String(R"(\
         class pybind11_static_property(property):
             def __get__(self, obj, cls):
                 return property.__get__(self, cls, cls)
@@ -93,530 +98,570 @@ inline PyTypeObject *make_static_property_type() {
             def __set__(self, obj, value):
                 cls = obj if isinstance(obj, type) else type(obj)
                 property.__set__(self, cls, value)
-        )", Py_file_input, d.ptr(), d.ptr()
-    );
-    if (result == nullptr)
-        throw error_already_set();
-    Py_DECREF(result);
-    return (PyTypeObject *) d["pybind11_static_property"].cast<object>().release().ptr();
+        )",
+                                  Py_file_input, d.ptr(), d.ptr());
+  if (result == nullptr)
+    throw error_already_set();
+  Py_DECREF(result);
+  return (PyTypeObject *)d["pybind11_static_property"]
+      .cast<object>()
+      .release()
+      .ptr();
 }
 
 #endif // PYPY
 
-/** Types with static properties need to handle `Type.static_prop = x` in a specific way.
-    By default, Python replaces the `static_property` itself, but for wrapped C++ types
-    we need to call `static_property.__set__()` in order to propagate the new value to
-    the underlying C++ data structure. */
-extern "C" inline int pybind11_meta_setattro(PyObject* obj, PyObject* name, PyObject* value) {
-    // Use `_PyType_Lookup()` instead of `PyObject_GetAttr()` in order to get the raw
-    // descriptor (`property`) instead of calling `tp_descr_get` (`property.__get__()`).
-    PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name);
+/** Types with static properties need to handle `Type.static_prop = x` in a
+   specific way. By default, Python replaces the `static_property` itself, but
+   for wrapped C++ types we need to call `static_property.__set__()` in order to
+   propagate the new value to the underlying C++ data structure. */
+extern "C" inline int pybind11_meta_setattro(PyObject *obj, PyObject *name,
+                                             PyObject *value) {
+  // Use `_PyType_Lookup()` instead of `PyObject_GetAttr()` in order to get the
+  // raw descriptor (`property`) instead of calling `tp_descr_get`
+  // (`property.__get__()`).
+  PyObject *descr = _PyType_Lookup((PyTypeObject *)obj, name);
 
-    // The following assignment combinations are possible:
-    //   1. `Type.static_prop = value`             --> descr_set: `Type.static_prop.__set__(value)`
-    //   2. `Type.static_prop = other_static_prop` --> setattro:  replace existing `static_prop`
-    //   3. `Type.regular_attribute = value`       --> setattro:  regular attribute assignment
-    const auto static_prop = (PyObject *) get_internals().static_property_type;
-    const auto call_descr_set = descr && PyObject_IsInstance(descr, static_prop)
-                                && !PyObject_IsInstance(value, static_prop);
-    if (call_descr_set) {
-        // Call `static_property.__set__()` instead of replacing the `static_property`.
+  // The following assignment combinations are possible:
+  //   1. `Type.static_prop = value`             --> descr_set:
+  //      `Type.static_prop.__set__(value)`
+  //   2. `Type.static_prop = other_static_prop` --> setattro:  replace existing
+  //      `static_prop`
+  //   3. `Type.regular_attribute = value`       --> setattro:  regular
+  //      attribute assignment
+  const auto static_prop = (PyObject *)get_internals().static_property_type;
+  const auto call_descr_set = descr &&
+                              PyObject_IsInstance(descr, static_prop) &&
+                              !PyObject_IsInstance(value, static_prop);
+  if (call_descr_set) {
+    // Call `static_property.__set__()` instead of replacing the
+    // `static_property`.
 #if !defined(PYPY_VERSION)
-        return Py_TYPE(descr)->tp_descr_set(descr, obj, value);
+    return Py_TYPE(descr)->tp_descr_set(descr, obj, value);
 #else
-        if (PyObject *result = PyObject_CallMethod(descr, "__set__", "OO", obj, value)) {
-            Py_DECREF(result);
-            return 0;
-        } else {
-            return -1;
-        }
-#endif
+    if (PyObject *result =
+            PyObject_CallMethod(descr, "__set__", "OO", obj, value)) {
+      Py_DECREF(result);
+      return 0;
     } else {
-        // Replace existing attribute.
-        return PyType_Type.tp_setattro(obj, name, value);
+      return -1;
     }
+#endif
+  } else {
+    // Replace existing attribute.
+    return PyType_Type.tp_setattro(obj, name, value);
+  }
 }
 
 #if PY_MAJOR_VERSION >= 3
 /**
- * Python 3's PyInstanceMethod_Type hides itself via its tp_descr_get, which prevents aliasing
- * methods via cls.attr("m2") = cls.attr("m1"): instead the tp_descr_get returns a plain function,
- * when called on a class, or a PyMethod, when called on an instance.  Override that behaviour here
- * to do a special case bypass for PyInstanceMethod_Types.
+ * Python 3's PyInstanceMethod_Type hides itself via its tp_descr_get, which
+ * prevents aliasing methods via cls.attr("m2") = cls.attr("m1"): instead the
+ * tp_descr_get returns a plain function, when called on a class, or a PyMethod,
+ * when called on an instance.  Override that behaviour here to do a special
+ * case bypass for PyInstanceMethod_Types.
  */
-extern "C" inline PyObject *pybind11_meta_getattro(PyObject *obj, PyObject *name) {
-    PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name);
-    if (descr && PyInstanceMethod_Check(descr)) {
-        Py_INCREF(descr);
-        return descr;
-    }
-    else {
-        return PyType_Type.tp_getattro(obj, name);
-    }
+extern "C" inline PyObject *pybind11_meta_getattro(PyObject *obj,
+                                                   PyObject *name) {
+  PyObject *descr = _PyType_Lookup((PyTypeObject *)obj, name);
+  if (descr && PyInstanceMethod_Check(descr)) {
+    Py_INCREF(descr);
+    return descr;
+  } else {
+    return PyType_Type.tp_getattro(obj, name);
+  }
 }
 #endif
 
-/** This metaclass is assigned by default to all pybind11 types and is required in order
-    for static properties to function correctly. Users may override this using `py::metaclass`.
-    Return value: New reference. */
-inline PyTypeObject* make_default_metaclass() {
-    constexpr auto *name = "pybind11_type";
-    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+/** This metaclass is assigned by default to all pybind11 types and is required
+   in order for static properties to function correctly. Users may override this
+   using `py::metaclass`. Return value: New reference. */
+inline PyTypeObject *make_default_metaclass() {
+  constexpr auto *name = "pybind11_type";
+  auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
 
-    /* Danger zone: from now (and until PyType_Ready), make sure to
-       issue no Python C API calls which could potentially invoke the
-       garbage collector (the GC will call type_traverse(), which will in
-       turn find the newly constructed type in an invalid state) */
-    auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0);
-    if (!heap_type)
-        pybind11_fail("make_default_metaclass(): error allocating metaclass!");
+  /* Danger zone: from now (and until PyType_Ready), make sure to
+     issue no Python C API calls which could potentially invoke the
+     garbage collector (the GC will call type_traverse(), which will in
+     turn find the newly constructed type in an invalid state) */
+  auto heap_type = (PyHeapTypeObject *)PyType_Type.tp_alloc(&PyType_Type, 0);
+  if (!heap_type)
+    pybind11_fail("make_default_metaclass(): error allocating metaclass!");
 
-    heap_type->ht_name = name_obj.inc_ref().ptr();
+  heap_type->ht_name = name_obj.inc_ref().ptr();
 #ifdef PYBIND11_BUILTIN_QUALNAME
-    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+  heap_type->ht_qualname = name_obj.inc_ref().ptr();
 #endif
 
-    auto type = &heap_type->ht_type;
-    type->tp_name = name;
-    type->tp_base = type_incref(&PyType_Type);
-    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+  auto type = &heap_type->ht_type;
+  type->tp_name = name;
+  type->tp_base = type_incref(&PyType_Type);
+  type->tp_flags =
+      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
 
-    type->tp_setattro = pybind11_meta_setattro;
+  type->tp_setattro = pybind11_meta_setattro;
 #if PY_MAJOR_VERSION >= 3
-    type->tp_getattro = pybind11_meta_getattro;
+  type->tp_getattro = pybind11_meta_getattro;
 #endif
 
-    if (PyType_Ready(type) < 0)
-        pybind11_fail("make_default_metaclass(): failure in PyType_Ready()!");
+  if (PyType_Ready(type) < 0)
+    pybind11_fail("make_default_metaclass(): failure in PyType_Ready()!");
 
-    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
-    PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
+  setattr((PyObject *)type, "__module__", str("pybind11_builtins"));
+  PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
 
-    return type;
+  return type;
 }
 
-/// For multiple inheritance types we need to recursively register/deregister base pointers for any
-/// base classes with pointers that are difference from the instance value pointer so that we can
-/// correctly recognize an offset base class pointer. This calls a function with any offset base ptrs.
-inline void traverse_offset_bases(void *valueptr, const detail::type_info *tinfo, instance *self,
-        bool (*f)(void * /*parentptr*/, instance * /*self*/)) {
-    for (handle h : reinterpret_borrow<tuple>(tinfo->type->tp_bases)) {
-        if (auto parent_tinfo = get_type_info((PyTypeObject *) h.ptr())) {
-            for (auto &c : parent_tinfo->implicit_casts) {
-                if (c.first == tinfo->cpptype) {
-                    auto *parentptr = c.second(valueptr);
-                    if (parentptr != valueptr)
-                        f(parentptr, self);
-                    traverse_offset_bases(parentptr, parent_tinfo, self, f);
-                    break;
-                }
-            }
+/// For multiple inheritance types we need to recursively register/deregister
+/// base pointers for any base classes with pointers that are different from
+/// the instance value pointer so that we can correctly recognize an offset base
+/// class pointer. This calls a function with any offset base ptrs.
+inline void
+traverse_offset_bases(void *valueptr, const detail::type_info *tinfo,
+                      instance *self,
+                      bool (*f)(void * /*parentptr*/, instance * /*self*/)) {
+  for (handle h : reinterpret_borrow<tuple>(tinfo->type->tp_bases)) {
+    if (auto parent_tinfo = get_type_info((PyTypeObject *)h.ptr())) {
+      for (auto &c : parent_tinfo->implicit_casts) {
+        if (c.first == tinfo->cpptype) {
+          auto *parentptr = c.second(valueptr);
+          if (parentptr != valueptr)
+            f(parentptr, self);
+          traverse_offset_bases(parentptr, parent_tinfo, self, f);
+          break;
         }
+      }
     }
+  }
 }
 
 inline bool register_instance_impl(void *ptr, instance *self) {
-    get_internals().registered_instances.emplace(ptr, self);
-    return true; // unused, but gives the same signature as the deregister func
+  get_internals().registered_instances.emplace(ptr, self);
+  return true; // unused, but gives the same signature as the deregister func
 }
 inline bool deregister_instance_impl(void *ptr, instance *self) {
-    auto &registered_instances = get_internals().registered_instances;
-    auto range = registered_instances.equal_range(ptr);
-    for (auto it = range.first; it != range.second; ++it) {
-        if (Py_TYPE(self) == Py_TYPE(it->second)) {
-            registered_instances.erase(it);
-            return true;
-        }
+  auto &registered_instances = get_internals().registered_instances;
+  auto range = registered_instances.equal_range(ptr);
+  for (auto it = range.first; it != range.second; ++it) {
+    if (Py_TYPE(self) == Py_TYPE(it->second)) {
+      registered_instances.erase(it);
+      return true;
     }
-    return false;
+  }
+  return false;
 }
 
-inline void register_instance(instance *self, void *valptr, const type_info *tinfo) {
-    register_instance_impl(valptr, self);
-    if (!tinfo->simple_ancestors)
-        traverse_offset_bases(valptr, tinfo, self, register_instance_impl);
+inline void register_instance(instance *self, void *valptr,
+                              const type_info *tinfo) {
+  register_instance_impl(valptr, self);
+  if (!tinfo->simple_ancestors)
+    traverse_offset_bases(valptr, tinfo, self, register_instance_impl);
 }
 
-inline bool deregister_instance(instance *self, void *valptr, const type_info *tinfo) {
-    bool ret = deregister_instance_impl(valptr, self);
-    if (!tinfo->simple_ancestors)
-        traverse_offset_bases(valptr, tinfo, self, deregister_instance_impl);
-    return ret;
+inline bool deregister_instance(instance *self, void *valptr,
+                                const type_info *tinfo) {
+  bool ret = deregister_instance_impl(valptr, self);
+  if (!tinfo->simple_ancestors)
+    traverse_offset_bases(valptr, tinfo, self, deregister_instance_impl);
+  return ret;
 }
 
-/// Instance creation function for all pybind11 types. It allocates the internal instance layout for
-/// holding C++ objects and holders.  Allocation is done lazily (the first time the instance is cast
-/// to a reference or pointer), and initialization is done by an `__init__` function.
+/// Instance creation function for all pybind11 types. It allocates the internal
+/// instance layout for holding C++ objects and holders.  Allocation is done
+/// lazily (the first time the instance is cast to a reference or pointer), and
+/// initialization is done by an `__init__` function.
 inline PyObject *make_new_instance(PyTypeObject *type) {
 #if defined(PYPY_VERSION)
-    // PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when the first inherited
-    // object is a a plain Python type (i.e. not derived from an extension type).  Fix it.
-    ssize_t instance_size = static_cast<ssize_t>(sizeof(instance));
-    if (type->tp_basicsize < instance_size) {
-        type->tp_basicsize = instance_size;
-    }
+  // PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when
+  // the first inherited object is a plain Python type (i.e. not derived from
+  // an extension type).  Fix it.
+  ssize_t instance_size = static_cast<ssize_t>(sizeof(instance));
+  if (type->tp_basicsize < instance_size) {
+    type->tp_basicsize = instance_size;
+  }
 #endif
-    PyObject *self = type->tp_alloc(type, 0);
-    auto inst = reinterpret_cast<instance *>(self);
-    // Allocate the value/holder internals:
-    inst->allocate_layout();
+  PyObject *self = type->tp_alloc(type, 0);
+  auto inst = reinterpret_cast<instance *>(self);
+  // Allocate the value/holder internals:
+  inst->allocate_layout();
 
-    inst->owned = true;
+  inst->owned = true;
 
-    return self;
+  return self;
 }
 
-/// Instance creation function for all pybind11 types. It only allocates space for the
-/// C++ object, but doesn't call the constructor -- an `__init__` function must do that.
-extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *) {
-    return make_new_instance(type);
+/// Instance creation function for all pybind11 types. It only allocates space
+/// for the C++ object, but doesn't call the constructor -- an `__init__`
+/// function must do that.
+extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *,
+                                                PyObject *) {
+  return make_new_instance(type);
 }
 
-/// An `__init__` function constructs the C++ object. Users should provide at least one
-/// of these using `py::init` or directly with `.def(__init__, ...)`. Otherwise, the
-/// following default function will be used which simply throws an exception.
-extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject *) {
-    PyTypeObject *type = Py_TYPE(self);
-    std::string msg;
+/// An `__init__` function constructs the C++ object. Users should provide at
+/// least one of these using `py::init` or directly with `.def(__init__, ...)`.
+/// Otherwise, the following default function will be used which simply throws
+/// an exception.
+extern "C" inline int pybind11_object_init(PyObject *self, PyObject *,
+                                           PyObject *) {
+  PyTypeObject *type = Py_TYPE(self);
+  std::string msg;
 #if defined(PYPY_VERSION)
-    msg += handle((PyObject *) type).attr("__module__").cast<std::string>() + ".";
+  msg += handle((PyObject *)type).attr("__module__").cast<std::string>() + ".";
 #endif
-    msg += type->tp_name;
-    msg += ": No constructor defined!";
-    PyErr_SetString(PyExc_TypeError, msg.c_str());
-    return -1;
+  msg += type->tp_name;
+  msg += ": No constructor defined!";
+  PyErr_SetString(PyExc_TypeError, msg.c_str());
+  return -1;
 }
 
 inline void add_patient(PyObject *nurse, PyObject *patient) {
-    auto &internals = get_internals();
-    auto instance = reinterpret_cast<detail::instance *>(nurse);
-    instance->has_patients = true;
-    Py_INCREF(patient);
-    internals.patients[nurse].push_back(patient);
+  auto &internals = get_internals();
+  auto instance = reinterpret_cast<detail::instance *>(nurse);
+  instance->has_patients = true;
+  Py_INCREF(patient);
+  internals.patients[nurse].push_back(patient);
 }
 
 inline void clear_patients(PyObject *self) {
-    auto instance = reinterpret_cast<detail::instance *>(self);
-    auto &internals = get_internals();
-    auto pos = internals.patients.find(self);
-    assert(pos != internals.patients.end());
-    // Clearing the patients can cause more Python code to run, which
-    // can invalidate the iterator. Extract the vector of patients
-    // from the unordered_map first.
-    auto patients = std::move(pos->second);
-    internals.patients.erase(pos);
-    instance->has_patients = false;
-    for (PyObject *&patient : patients)
-        Py_CLEAR(patient);
+  auto instance = reinterpret_cast<detail::instance *>(self);
+  auto &internals = get_internals();
+  auto pos = internals.patients.find(self);
+  assert(pos != internals.patients.end());
+  // Clearing the patients can cause more Python code to run, which
+  // can invalidate the iterator. Extract the vector of patients
+  // from the unordered_map first.
+  auto patients = std::move(pos->second);
+  internals.patients.erase(pos);
+  instance->has_patients = false;
+  for (PyObject *&patient : patients)
+    Py_CLEAR(patient);
 }
 
-/// Clears all internal data from the instance and removes it from registered instances in
-/// preparation for deallocation.
+/// Clears all internal data from the instance and removes it from registered
+/// instances in preparation for deallocation.
 inline void clear_instance(PyObject *self) {
-    auto instance = reinterpret_cast<detail::instance *>(self);
+  auto instance = reinterpret_cast<detail::instance *>(self);
 
-    // Deallocate any values/holders, if present:
-    for (auto &v_h : values_and_holders(instance)) {
-        if (v_h) {
+  // Deallocate any values/holders, if present:
+  for (auto &v_h : values_and_holders(instance)) {
+    if (v_h) {
 
-            // We have to deregister before we call dealloc because, for virtual MI types, we still
-            // need to be able to get the parent pointers.
-            if (v_h.instance_registered() && !deregister_instance(instance, v_h.value_ptr(), v_h.type))
-                pybind11_fail("pybind11_object_dealloc(): Tried to deallocate unregistered instance!");
+      // We have to deregister before we call dealloc because, for virtual MI
+      // types, we still need to be able to get the parent pointers.
+      if (v_h.instance_registered() &&
+          !deregister_instance(instance, v_h.value_ptr(), v_h.type))
+        pybind11_fail("pybind11_object_dealloc(): Tried to deallocate "
+                      "unregistered instance!");
 
-            if (instance->owned || v_h.holder_constructed())
-                v_h.type->dealloc(v_h);
-        }
+      if (instance->owned || v_h.holder_constructed())
+        v_h.type->dealloc(v_h);
     }
-    // Deallocate the value/holder layout internals:
-    instance->deallocate_layout();
+  }
+  // Deallocate the value/holder layout internals:
+  instance->deallocate_layout();
 
-    if (instance->weakrefs)
-        PyObject_ClearWeakRefs(self);
+  if (instance->weakrefs)
+    PyObject_ClearWeakRefs(self);
 
-    PyObject **dict_ptr = _PyObject_GetDictPtr(self);
-    if (dict_ptr)
-        Py_CLEAR(*dict_ptr);
+  PyObject **dict_ptr = _PyObject_GetDictPtr(self);
+  if (dict_ptr)
+    Py_CLEAR(*dict_ptr);
 
-    if (instance->has_patients)
-        clear_patients(self);
+  if (instance->has_patients)
+    clear_patients(self);
 }
 
-/// Instance destructor function for all pybind11 types. It calls `type_info.dealloc`
-/// to destroy the C++ object itself, while the rest is Python bookkeeping.
+/// Instance destructor function for all pybind11 types. It calls
+/// `type_info.dealloc` to destroy the C++ object itself, while the rest is
+/// Python bookkeeping.
 extern "C" inline void pybind11_object_dealloc(PyObject *self) {
-    clear_instance(self);
+  clear_instance(self);
 
-    auto type = Py_TYPE(self);
-    type->tp_free(self);
+  auto type = Py_TYPE(self);
+  type->tp_free(self);
 
-    // `type->tp_dealloc != pybind11_object_dealloc` means that we're being called
-    // as part of a derived type's dealloc, in which case we're not allowed to decref
-    // the type here. For cross-module compatibility, we shouldn't compare directly
-    // with `pybind11_object_dealloc`, but with the common one stashed in internals.
-    auto pybind11_object_type = (PyTypeObject *) get_internals().instance_base;
-    if (type->tp_dealloc == pybind11_object_type->tp_dealloc)
-        Py_DECREF(type);
+  // `type->tp_dealloc != pybind11_object_dealloc` means that we're being called
+  // as part of a derived type's dealloc, in which case we're not allowed to
+  // decref the type here. For cross-module compatibility, we shouldn't compare
+  // directly with `pybind11_object_dealloc`, but with the common one stashed in
+  // internals.
+  auto pybind11_object_type = (PyTypeObject *)get_internals().instance_base;
+  if (type->tp_dealloc == pybind11_object_type->tp_dealloc)
+    Py_DECREF(type);
 }
 
 /** Create the type which can be used as a common base for all classes.  This is
     needed in order to satisfy Python's requirements for multiple inheritance.
     Return value: New reference. */
 inline PyObject *make_object_base_type(PyTypeObject *metaclass) {
-    constexpr auto *name = "pybind11_object";
-    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+  constexpr auto *name = "pybind11_object";
+  auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
 
-    /* Danger zone: from now (and until PyType_Ready), make sure to
-       issue no Python C API calls which could potentially invoke the
-       garbage collector (the GC will call type_traverse(), which will in
-       turn find the newly constructed type in an invalid state) */
-    auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
-    if (!heap_type)
-        pybind11_fail("make_object_base_type(): error allocating type!");
+  /* Danger zone: from now (and until PyType_Ready), make sure to
+     issue no Python C API calls which could potentially invoke the
+     garbage collector (the GC will call type_traverse(), which will in
+     turn find the newly constructed type in an invalid state) */
+  auto heap_type = (PyHeapTypeObject *)metaclass->tp_alloc(metaclass, 0);
+  if (!heap_type)
+    pybind11_fail("make_object_base_type(): error allocating type!");
 
-    heap_type->ht_name = name_obj.inc_ref().ptr();
+  heap_type->ht_name = name_obj.inc_ref().ptr();
 #ifdef PYBIND11_BUILTIN_QUALNAME
-    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+  heap_type->ht_qualname = name_obj.inc_ref().ptr();
 #endif
 
-    auto type = &heap_type->ht_type;
-    type->tp_name = name;
-    type->tp_base = type_incref(&PyBaseObject_Type);
-    type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
-    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+  auto type = &heap_type->ht_type;
+  type->tp_name = name;
+  type->tp_base = type_incref(&PyBaseObject_Type);
+  type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
+  type->tp_flags =
+      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
 
-    type->tp_new = pybind11_object_new;
-    type->tp_init = pybind11_object_init;
-    type->tp_dealloc = pybind11_object_dealloc;
+  type->tp_new = pybind11_object_new;
+  type->tp_init = pybind11_object_init;
+  type->tp_dealloc = pybind11_object_dealloc;
 
-    /* Support weak references (needed for the keep_alive feature) */
-    type->tp_weaklistoffset = offsetof(instance, weakrefs);
+  /* Support weak references (needed for the keep_alive feature) */
+  type->tp_weaklistoffset = offsetof(instance, weakrefs);
 
-    if (PyType_Ready(type) < 0)
-        pybind11_fail("PyType_Ready failed in make_object_base_type():" + error_string());
+  if (PyType_Ready(type) < 0)
+    pybind11_fail("PyType_Ready failed in make_object_base_type():" +
+                  error_string());
 
-    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
-    PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
+  setattr((PyObject *)type, "__module__", str("pybind11_builtins"));
+  PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
 
-    assert(!PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
-    return (PyObject *) heap_type;
+  assert(!PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+  return (PyObject *)heap_type;
 }
 
 /// dynamic_attr: Support for `d = instance.__dict__`.
 extern "C" inline PyObject *pybind11_get_dict(PyObject *self, void *) {
-    PyObject *&dict = *_PyObject_GetDictPtr(self);
-    if (!dict)
-        dict = PyDict_New();
-    Py_XINCREF(dict);
-    return dict;
+  PyObject *&dict = *_PyObject_GetDictPtr(self);
+  if (!dict)
+    dict = PyDict_New();
+  Py_XINCREF(dict);
+  return dict;
 }
 
 /// dynamic_attr: Support for `instance.__dict__ = dict()`.
-extern "C" inline int pybind11_set_dict(PyObject *self, PyObject *new_dict, void *) {
-    if (!PyDict_Check(new_dict)) {
-        PyErr_Format(PyExc_TypeError, "__dict__ must be set to a dictionary, not a '%.200s'",
-                     Py_TYPE(new_dict)->tp_name);
-        return -1;
-    }
-    PyObject *&dict = *_PyObject_GetDictPtr(self);
-    Py_INCREF(new_dict);
-    Py_CLEAR(dict);
-    dict = new_dict;
-    return 0;
+extern "C" inline int pybind11_set_dict(PyObject *self, PyObject *new_dict,
+                                        void *) {
+  if (!PyDict_Check(new_dict)) {
+    PyErr_Format(PyExc_TypeError,
+                 "__dict__ must be set to a dictionary, not a '%.200s'",
+                 Py_TYPE(new_dict)->tp_name);
+    return -1;
+  }
+  PyObject *&dict = *_PyObject_GetDictPtr(self);
+  Py_INCREF(new_dict);
+  Py_CLEAR(dict);
+  dict = new_dict;
+  return 0;
 }
 
-/// dynamic_attr: Allow the garbage collector to traverse the internal instance `__dict__`.
-extern "C" inline int pybind11_traverse(PyObject *self, visitproc visit, void *arg) {
-    PyObject *&dict = *_PyObject_GetDictPtr(self);
-    Py_VISIT(dict);
-    return 0;
+/// dynamic_attr: Allow the garbage collector to traverse the internal instance
+/// `__dict__`.
+extern "C" inline int pybind11_traverse(PyObject *self, visitproc visit,
+                                        void *arg) {
+  PyObject *&dict = *_PyObject_GetDictPtr(self);
+  Py_VISIT(dict);
+  return 0;
 }
 
 /// dynamic_attr: Allow the GC to clear the dictionary.
 extern "C" inline int pybind11_clear(PyObject *self) {
-    PyObject *&dict = *_PyObject_GetDictPtr(self);
-    Py_CLEAR(dict);
-    return 0;
+  PyObject *&dict = *_PyObject_GetDictPtr(self);
+  Py_CLEAR(dict);
+  return 0;
 }
 
 /// Give instances of this type a `__dict__` and opt into garbage collection.
 inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type) {
-    auto type = &heap_type->ht_type;
+  auto type = &heap_type->ht_type;
 #if defined(PYPY_VERSION)
-    pybind11_fail(std::string(type->tp_name) + ": dynamic attributes are "
-                                               "currently not supported in "
-                                               "conjunction with PyPy!");
+  pybind11_fail(std::string(type->tp_name) + ": dynamic attributes are "
+                                             "currently not supported in "
+                                             "conjunction with PyPy!");
 #endif
-    type->tp_flags |= Py_TPFLAGS_HAVE_GC;
-    type->tp_dictoffset = type->tp_basicsize; // place dict at the end
-    type->tp_basicsize += (ssize_t)sizeof(PyObject *); // and allocate enough space for it
-    type->tp_traverse = pybind11_traverse;
-    type->tp_clear = pybind11_clear;
+  type->tp_flags |= Py_TPFLAGS_HAVE_GC;
+  type->tp_dictoffset = type->tp_basicsize; // place dict at the end
+  type->tp_basicsize +=
+      (ssize_t)sizeof(PyObject *); // and allocate enough space for it
+  type->tp_traverse = pybind11_traverse;
+  type->tp_clear = pybind11_clear;
 
-    static PyGetSetDef getset[] = {
-        {const_cast<char*>("__dict__"), pybind11_get_dict, pybind11_set_dict, nullptr, nullptr},
-        {nullptr, nullptr, nullptr, nullptr, nullptr}
-    };
-    type->tp_getset = getset;
+  static PyGetSetDef getset[] = {{const_cast<char *>("__dict__"),
+                                  pybind11_get_dict, pybind11_set_dict, nullptr,
+                                  nullptr},
+                                 {nullptr, nullptr, nullptr, nullptr, nullptr}};
+  type->tp_getset = getset;
 }
 
 /// buffer_protocol: Fill in the view as specified by flags.
-extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int flags) {
-    // Look for a `get_buffer` implementation in this type's info or any bases (following MRO).
-    type_info *tinfo = nullptr;
-    for (auto type : reinterpret_borrow<tuple>(Py_TYPE(obj)->tp_mro)) {
-        tinfo = get_type_info((PyTypeObject *) type.ptr());
-        if (tinfo && tinfo->get_buffer)
-            break;
-    }
-    if (view == nullptr || !tinfo || !tinfo->get_buffer) {
-        if (view)
-            view->obj = nullptr;
-        PyErr_SetString(PyExc_BufferError, "pybind11_getbuffer(): Internal error");
-        return -1;
-    }
-    std::memset(view, 0, sizeof(Py_buffer));
-    buffer_info *info = tinfo->get_buffer(obj, tinfo->get_buffer_data);
-    view->obj = obj;
-    view->ndim = 1;
-    view->internal = info;
-    view->buf = info->ptr;
-    view->itemsize = info->itemsize;
-    view->len = view->itemsize;
-    for (auto s : info->shape)
-        view->len *= s;
-    if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT)
-        view->format = const_cast<char *>(info->format.c_str());
-    if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
-        view->ndim = (int) info->ndim;
-        view->strides = &info->strides[0];
-        view->shape = &info->shape[0];
-    }
-    Py_INCREF(view->obj);
-    return 0;
+extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view,
+                                         int flags) {
+  // Look for a `get_buffer` implementation in this type's info or any bases
+  // (following MRO).
+  type_info *tinfo = nullptr;
+  for (auto type : reinterpret_borrow<tuple>(Py_TYPE(obj)->tp_mro)) {
+    tinfo = get_type_info((PyTypeObject *)type.ptr());
+    if (tinfo && tinfo->get_buffer)
+      break;
+  }
+  if (view == nullptr || !tinfo || !tinfo->get_buffer) {
+    if (view)
+      view->obj = nullptr;
+    PyErr_SetString(PyExc_BufferError, "pybind11_getbuffer(): Internal error");
+    return -1;
+  }
+  std::memset(view, 0, sizeof(Py_buffer));
+  buffer_info *info = tinfo->get_buffer(obj, tinfo->get_buffer_data);
+  view->obj = obj;
+  view->ndim = 1;
+  view->internal = info;
+  view->buf = info->ptr;
+  view->itemsize = info->itemsize;
+  view->len = view->itemsize;
+  for (auto s : info->shape)
+    view->len *= s;
+  if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT)
+    view->format = const_cast<char *>(info->format.c_str());
+  if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
+    view->ndim = (int)info->ndim;
+    view->strides = &info->strides[0];
+    view->shape = &info->shape[0];
+  }
+  Py_INCREF(view->obj);
+  return 0;
 }
 
 /// buffer_protocol: Release the resources of the buffer.
 extern "C" inline void pybind11_releasebuffer(PyObject *, Py_buffer *view) {
-    delete (buffer_info *) view->internal;
+  delete (buffer_info *)view->internal;
 }
 
 /// Give this type a buffer interface.
 inline void enable_buffer_protocol(PyHeapTypeObject *heap_type) {
-    heap_type->ht_type.tp_as_buffer = &heap_type->as_buffer;
+  heap_type->ht_type.tp_as_buffer = &heap_type->as_buffer;
 #if PY_MAJOR_VERSION < 3
-    heap_type->ht_type.tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER;
+  heap_type->ht_type.tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER;
 #endif
 
-    heap_type->as_buffer.bf_getbuffer = pybind11_getbuffer;
-    heap_type->as_buffer.bf_releasebuffer = pybind11_releasebuffer;
+  heap_type->as_buffer.bf_getbuffer = pybind11_getbuffer;
+  heap_type->as_buffer.bf_releasebuffer = pybind11_releasebuffer;
 }
 
 /** Create a brand new Python type according to the `type_record` specification.
     Return value: New reference. */
-inline PyObject* make_new_python_type(const type_record &rec) {
-    auto name = reinterpret_steal<object>(PYBIND11_FROM_STRING(rec.name));
+inline PyObject *make_new_python_type(const type_record &rec) {
+  auto name = reinterpret_steal<object>(PYBIND11_FROM_STRING(rec.name));
 
-    auto qualname = name;
-    if (rec.scope && !PyModule_Check(rec.scope.ptr()) && hasattr(rec.scope, "__qualname__")) {
+  auto qualname = name;
+  if (rec.scope && !PyModule_Check(rec.scope.ptr()) &&
+      hasattr(rec.scope, "__qualname__")) {
 #if PY_MAJOR_VERSION >= 3
-        qualname = reinterpret_steal<object>(
-            PyUnicode_FromFormat("%U.%U", rec.scope.attr("__qualname__").ptr(), name.ptr()));
+    qualname = reinterpret_steal<object>(PyUnicode_FromFormat(
+        "%U.%U", rec.scope.attr("__qualname__").ptr(), name.ptr()));
 #else
-        qualname = str(rec.scope.attr("__qualname__").cast<std::string>() + "." + rec.name);
+    qualname = str(rec.scope.attr("__qualname__").cast<std::string>() + "." +
+                   rec.name);
 #endif
-    }
+  }
 
-    object module;
-    if (rec.scope) {
-        if (hasattr(rec.scope, "__module__"))
-            module = rec.scope.attr("__module__");
-        else if (hasattr(rec.scope, "__name__"))
-            module = rec.scope.attr("__name__");
-    }
+  object module;
+  if (rec.scope) {
+    if (hasattr(rec.scope, "__module__"))
+      module = rec.scope.attr("__module__");
+    else if (hasattr(rec.scope, "__name__"))
+      module = rec.scope.attr("__name__");
+  }
 
-    auto full_name = c_str(
+  auto full_name = c_str(
 #if !defined(PYPY_VERSION)
-        module ? str(module).cast<std::string>() + "." + rec.name :
+      module ? str(module).cast<std::string>() + "." + rec.name :
 #endif
-        rec.name);
+             rec.name);
 
-    char *tp_doc = nullptr;
-    if (rec.doc && options::show_user_defined_docstrings()) {
-        /* Allocate memory for docstring (using PyObject_MALLOC, since
-           Python will free this later on) */
-        size_t size = strlen(rec.doc) + 1;
-        tp_doc = (char *) PyObject_MALLOC(size);
-        memcpy((void *) tp_doc, rec.doc, size);
-    }
+  char *tp_doc = nullptr;
+  if (rec.doc && options::show_user_defined_docstrings()) {
+    /* Allocate memory for docstring (using PyObject_MALLOC, since
+       Python will free this later on) */
+    size_t size = strlen(rec.doc) + 1;
+    tp_doc = (char *)PyObject_MALLOC(size);
+    memcpy((void *)tp_doc, rec.doc, size);
+  }
 
-    auto &internals = get_internals();
-    auto bases = tuple(rec.bases);
-    auto base = (bases.size() == 0) ? internals.instance_base
-                                    : bases[0].ptr();
+  auto &internals = get_internals();
+  auto bases = tuple(rec.bases);
+  auto base = (bases.size() == 0) ? internals.instance_base : bases[0].ptr();
 
-    /* Danger zone: from now (and until PyType_Ready), make sure to
-       issue no Python C API calls which could potentially invoke the
-       garbage collector (the GC will call type_traverse(), which will in
-       turn find the newly constructed type in an invalid state) */
-    auto metaclass = rec.metaclass.ptr() ? (PyTypeObject *) rec.metaclass.ptr()
-                                         : internals.default_metaclass;
+  /* Danger zone: from now (and until PyType_Ready), make sure to
+     issue no Python C API calls which could potentially invoke the
+     garbage collector (the GC will call type_traverse(), which will in
+     turn find the newly constructed type in an invalid state) */
+  auto metaclass = rec.metaclass.ptr() ? (PyTypeObject *)rec.metaclass.ptr()
+                                       : internals.default_metaclass;
 
-    auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
-    if (!heap_type)
-        pybind11_fail(std::string(rec.name) + ": Unable to create type object!");
+  auto heap_type = (PyHeapTypeObject *)metaclass->tp_alloc(metaclass, 0);
+  if (!heap_type)
+    pybind11_fail(std::string(rec.name) + ": Unable to create type object!");
 
-    heap_type->ht_name = name.release().ptr();
+  heap_type->ht_name = name.release().ptr();
 #ifdef PYBIND11_BUILTIN_QUALNAME
-    heap_type->ht_qualname = qualname.inc_ref().ptr();
+  heap_type->ht_qualname = qualname.inc_ref().ptr();
 #endif
 
-    auto type = &heap_type->ht_type;
-    type->tp_name = full_name;
-    type->tp_doc = tp_doc;
-    type->tp_base = type_incref((PyTypeObject *)base);
-    type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
-    if (bases.size() > 0)
-        type->tp_bases = bases.release().ptr();
+  auto type = &heap_type->ht_type;
+  type->tp_name = full_name;
+  type->tp_doc = tp_doc;
+  type->tp_base = type_incref((PyTypeObject *)base);
+  type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
+  if (bases.size() > 0)
+    type->tp_bases = bases.release().ptr();
 
-    /* Don't inherit base __init__ */
-    type->tp_init = pybind11_object_init;
+  /* Don't inherit base __init__ */
+  type->tp_init = pybind11_object_init;
 
-    /* Supported protocols */
-    type->tp_as_number = &heap_type->as_number;
-    type->tp_as_sequence = &heap_type->as_sequence;
-    type->tp_as_mapping = &heap_type->as_mapping;
+  /* Supported protocols */
+  type->tp_as_number = &heap_type->as_number;
+  type->tp_as_sequence = &heap_type->as_sequence;
+  type->tp_as_mapping = &heap_type->as_mapping;
 
-    /* Flags */
-    type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+  /* Flags */
+  type->tp_flags |=
+      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
 #if PY_MAJOR_VERSION < 3
-    type->tp_flags |= Py_TPFLAGS_CHECKTYPES;
+  type->tp_flags |= Py_TPFLAGS_CHECKTYPES;
 #endif
 
-    if (rec.dynamic_attr)
-        enable_dynamic_attributes(heap_type);
+  if (rec.dynamic_attr)
+    enable_dynamic_attributes(heap_type);
 
-    if (rec.buffer_protocol)
-        enable_buffer_protocol(heap_type);
+  if (rec.buffer_protocol)
+    enable_buffer_protocol(heap_type);
 
-    if (PyType_Ready(type) < 0)
-        pybind11_fail(std::string(rec.name) + ": PyType_Ready failed (" + error_string() + ")!");
+  if (PyType_Ready(type) < 0)
+    pybind11_fail(std::string(rec.name) + ": PyType_Ready failed (" +
+                  error_string() + ")!");
 
-    assert(rec.dynamic_attr ? PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)
-                            : !PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+  assert(rec.dynamic_attr ? PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)
+                          : !PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
 
-    /* Register type with the parent scope */
-    if (rec.scope)
-        setattr(rec.scope, rec.name, (PyObject *) type);
-    else
-        Py_INCREF(type); // Keep it alive forever (reference leak)
+  /* Register type with the parent scope */
+  if (rec.scope)
+    setattr(rec.scope, rec.name, (PyObject *)type);
+  else
+    Py_INCREF(type); // Keep it alive forever (reference leak)
 
-    if (module) // Needed by pydoc
-        setattr((PyObject *) type, "__module__", module);
+  if (module) // Needed by pydoc
+    setattr((PyObject *)type, "__module__", module);
 
-    PYBIND11_SET_OLDPY_QUALNAME(type, qualname);
+  PYBIND11_SET_OLDPY_QUALNAME(type, qualname);
 
-    return (PyObject *) type;
+  return (PyObject *)type;
 }
 
 NAMESPACE_END(detail)
diff --git a/python/src/pybind11/detail/common.h b/python/src/pybind11/detail/common.h
index bec8ccf3b..c874a1779 100644
--- a/python/src/pybind11/detail/common.h
+++ b/python/src/pybind11/detail/common.h
@@ -10,103 +10,105 @@
 #pragma once
 
 #if !defined(NAMESPACE_BEGIN)
-#  define NAMESPACE_BEGIN(name) namespace name {
+#define NAMESPACE_BEGIN(name) namespace name {
 #endif
 #if !defined(NAMESPACE_END)
-#  define NAMESPACE_END(name) }
+#define NAMESPACE_END(name) }
 #endif
 
-// Robust support for some features and loading modules compiled against different pybind versions
-// requires forcing hidden visibility on pybind code, so we enforce this by setting the attribute on
-// the main `pybind11` namespace.
+// Robust support for some features and loading modules compiled against
+// different pybind versions requires forcing hidden visibility on pybind code,
+// so we enforce this by setting the attribute on the main `pybind11` namespace.
 #if !defined(PYBIND11_NAMESPACE)
-#  ifdef __GNUG__
-#    define PYBIND11_NAMESPACE pybind11 __attribute__((visibility("hidden")))
-#  else
-#    define PYBIND11_NAMESPACE pybind11
-#  endif
+#ifdef __GNUG__
+#define PYBIND11_NAMESPACE pybind11 __attribute__((visibility("hidden")))
+#else
+#define PYBIND11_NAMESPACE pybind11
+#endif
 #endif
 
 #if !(defined(_MSC_VER) && __cplusplus == 199711L) && !defined(__INTEL_COMPILER)
-#  if __cplusplus >= 201402L
-#    define PYBIND11_CPP14
-#    if __cplusplus >= 201703L
-#      define PYBIND11_CPP17
-#    endif
-#  endif
+#if __cplusplus >= 201402L
+#define PYBIND11_CPP14
+#if __cplusplus >= 201703L
+#define PYBIND11_CPP17
+#endif
+#endif
 #elif defined(_MSC_VER) && __cplusplus == 199711L
-// MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard is fully implemented)
-// Unless you use the /Zc:__cplusplus flag on Visual Studio 2017 15.7 Preview 3 or newer
-#  if _MSVC_LANG >= 201402L
-#    define PYBIND11_CPP14
-#    if _MSVC_LANG > 201402L && _MSC_VER >= 1910
-#      define PYBIND11_CPP17
-#    endif
-#  endif
+// MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard
+// is fully implemented), unless you use the /Zc:__cplusplus flag on Visual
+// Studio 2017 15.7 Preview 3 or newer.
+#if _MSVC_LANG >= 201402L
+#define PYBIND11_CPP14
+#if _MSVC_LANG > 201402L && _MSC_VER >= 1910
+#define PYBIND11_CPP17
+#endif
+#endif
 #endif
 
 // Compiler version assertions
 #if defined(__INTEL_COMPILER)
-#  if __INTEL_COMPILER < 1700
-#    error pybind11 requires Intel C++ compiler v17 or newer
-#  endif
+#if __INTEL_COMPILER < 1700
+#error pybind11 requires Intel C++ compiler v17 or newer
+#endif
 #elif defined(__clang__) && !defined(__apple_build_version__)
-#  if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 3)
-#    error pybind11 requires clang 3.3 or newer
-#  endif
+#if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 3)
+#error pybind11 requires clang 3.3 or newer
+#endif
 #elif defined(__clang__)
-// Apple changes clang version macros to its Xcode version; the first Xcode release based on
-// (upstream) clang 3.3 was Xcode 5:
-#  if __clang_major__ < 5
-#    error pybind11 requires Xcode/clang 5.0 or newer
-#  endif
+// Apple changes clang version macros to its Xcode version; the first Xcode
+// release based on (upstream) clang 3.3 was Xcode 5:
+#if __clang_major__ < 5
+#error pybind11 requires Xcode/clang 5.0 or newer
+#endif
 #elif defined(__GNUG__)
-#  if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)
-#    error pybind11 requires gcc 4.8 or newer
-#  endif
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)
+#error pybind11 requires gcc 4.8 or newer
+#endif
 #elif defined(_MSC_VER)
-// Pybind hits various compiler bugs in 2015u2 and earlier, and also makes use of some stl features
-// (e.g. std::negation) added in 2015u3:
-#  if _MSC_FULL_VER < 190024210
-#    error pybind11 requires MSVC 2015 update 3 or newer
-#  endif
+// Pybind hits various compiler bugs in 2015u2 and earlier, and also makes use
+// of some stl features (e.g. std::negation) added in 2015u3:
+#if _MSC_FULL_VER < 190024210
+#error pybind11 requires MSVC 2015 update 3 or newer
+#endif
 #endif
 
 #if !defined(PYBIND11_EXPORT)
-#  if defined(WIN32) || defined(_WIN32)
-#    define PYBIND11_EXPORT __declspec(dllexport)
-#  else
-#    define PYBIND11_EXPORT __attribute__ ((visibility("default")))
-#  endif
+#if defined(WIN32) || defined(_WIN32)
+#define PYBIND11_EXPORT __declspec(dllexport)
+#else
+#define PYBIND11_EXPORT __attribute__((visibility("default")))
+#endif
 #endif
 
 #if defined(_MSC_VER)
-#  define PYBIND11_NOINLINE __declspec(noinline)
+#define PYBIND11_NOINLINE __declspec(noinline)
 #else
-#  define PYBIND11_NOINLINE __attribute__ ((noinline))
+#define PYBIND11_NOINLINE __attribute__((noinline))
 #endif
 
 #if defined(PYBIND11_CPP14)
-#  define PYBIND11_DEPRECATED(reason) [[deprecated(reason)]]
+#define PYBIND11_DEPRECATED(reason) [[deprecated(reason)]]
 #else
-#  define PYBIND11_DEPRECATED(reason) __attribute__((deprecated(reason)))
+#define PYBIND11_DEPRECATED(reason) __attribute__((deprecated(reason)))
 #endif
 
 #define PYBIND11_VERSION_MAJOR 2
 #define PYBIND11_VERSION_MINOR 3
 #define PYBIND11_VERSION_PATCH 0
 
-/// Include Python header, disable linking to pythonX_d.lib on Windows in debug mode
+/// Include Python header, disable linking to pythonX_d.lib on Windows in debug
+/// mode
 #if defined(_MSC_VER)
-#  if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION < 4)
-#    define HAVE_ROUND 1
-#  endif
-#  pragma warning(push)
-#  pragma warning(disable: 4510 4610 4512 4005)
-#  if defined(_DEBUG)
-#    define PYBIND11_DEBUG_MARKER
-#    undef _DEBUG
-#  endif
+#if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION < 4)
+#define HAVE_ROUND 1
+#endif
+#pragma warning(push)
+#pragma warning(disable : 4510 4610 4512 4005)
+#if defined(_DEBUG)
+#define PYBIND11_DEBUG_MARKER
+#undef _DEBUG
+#endif
 #endif
 
 #include <Python.h>
@@ -114,38 +116,38 @@
 #include <pythread.h>
 
 #if defined(_WIN32) && (defined(min) || defined(max))
-#  error Macro clash with min and max -- define NOMINMAX when compiling your program on Windows
+#error Macro clash with min and max -- define NOMINMAX when compiling your program on Windows
 #endif
 
 #if defined(isalnum)
-#  undef isalnum
-#  undef isalpha
-#  undef islower
-#  undef isspace
-#  undef isupper
-#  undef tolower
-#  undef toupper
+#undef isalnum
+#undef isalpha
+#undef islower
+#undef isspace
+#undef isupper
+#undef tolower
+#undef toupper
 #endif
 
 #if defined(_MSC_VER)
-#  if defined(PYBIND11_DEBUG_MARKER)
-#    define _DEBUG
-#    undef PYBIND11_DEBUG_MARKER
-#  endif
-#  pragma warning(pop)
+#if defined(PYBIND11_DEBUG_MARKER)
+#define _DEBUG
+#undef PYBIND11_DEBUG_MARKER
+#endif
+#pragma warning(pop)
 #endif
 
 #include <cstddef>
 #include <cstring>
 #include <forward_list>
-#include <vector>
-#include <string>
-#include <stdexcept>
-#include <unordered_set>
-#include <unordered_map>
 #include <memory>
-#include <typeindex>
+#include <stdexcept>
+#include <string>
 #include <type_traits>
+#include <typeindex>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
 
 #if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions
 #define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr)
@@ -159,8 +161,8 @@
 #define PYBIND11_BYTES_SIZE PyBytes_Size
 #define PYBIND11_LONG_CHECK(o) PyLong_Check(o)
 #define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o)
-#define PYBIND11_LONG_FROM_SIGNED(o) PyLong_FromSsize_t((ssize_t) o)
-#define PYBIND11_LONG_FROM_UNSIGNED(o) PyLong_FromSize_t((size_t) o)
+#define PYBIND11_LONG_FROM_SIGNED(o) PyLong_FromSsize_t((ssize_t)o)
+#define PYBIND11_LONG_FROM_UNSIGNED(o) PyLong_FromSize_t((size_t)o)
 #define PYBIND11_BYTES_NAME "bytes"
 #define PYBIND11_STRING_NAME "str"
 #define PYBIND11_SLICE_OBJECT PyObject
@@ -168,11 +170,12 @@
 #define PYBIND11_STR_TYPE ::pybind11::str
 #define PYBIND11_BOOL_ATTR "__bool__"
 #define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_bool)
-#define PYBIND11_PLUGIN_IMPL(name) \
-    extern "C" PYBIND11_EXPORT PyObject *PyInit_##name()
+#define PYBIND11_PLUGIN_IMPL(name)                                             \
+  extern "C" PYBIND11_EXPORT PyObject *PyInit_##name()
 
 #else
-#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyMethod_New(ptr, nullptr, class_)
+#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_)                              \
+  PyMethod_New(ptr, nullptr, class_)
 #define PYBIND11_INSTANCE_METHOD_CHECK PyMethod_Check
 #define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyMethod_GET_FUNCTION
 #define PYBIND11_BYTES_CHECK PyString_Check
@@ -182,9 +185,12 @@
 #define PYBIND11_BYTES_AS_STRING PyString_AsString
 #define PYBIND11_BYTES_SIZE PyString_Size
 #define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o))
-#define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o))
-#define PYBIND11_LONG_FROM_SIGNED(o) PyInt_FromSsize_t((ssize_t) o) // Returns long if needed.
-#define PYBIND11_LONG_FROM_UNSIGNED(o) PyInt_FromSize_t((size_t) o) // Returns long if needed.
+#define PYBIND11_LONG_AS_LONGLONG(o)                                           \
+  (PyInt_Check(o) ? (long long)PyLong_AsLong(o) : PyLong_AsLongLong(o))
+#define PYBIND11_LONG_FROM_SIGNED(o)                                           \
+  PyInt_FromSsize_t((ssize_t)o) // Returns long if needed.
+#define PYBIND11_LONG_FROM_UNSIGNED(o)                                         \
+  PyInt_FromSize_t((size_t)o) // Returns long if needed.
 #define PYBIND11_BYTES_NAME "str"
 #define PYBIND11_STRING_NAME "unicode"
 #define PYBIND11_SLICE_OBJECT PySliceObject
@@ -192,57 +198,62 @@
 #define PYBIND11_STR_TYPE ::pybind11::bytes
 #define PYBIND11_BOOL_ATTR "__nonzero__"
 #define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_nonzero)
-#define PYBIND11_PLUGIN_IMPL(name) \
-    static PyObject *pybind11_init_wrapper();               \
-    extern "C" PYBIND11_EXPORT void init##name() {          \
-        (void)pybind11_init_wrapper();                      \
-    }                                                       \
-    PyObject *pybind11_init_wrapper()
+#define PYBIND11_PLUGIN_IMPL(name)                                             \
+  static PyObject *pybind11_init_wrapper();                                    \
+  extern "C" PYBIND11_EXPORT void init##name() {                               \
+    (void)pybind11_init_wrapper();                                             \
+  }                                                                            \
+  PyObject *pybind11_init_wrapper()
 #endif
 
 #if PY_VERSION_HEX >= 0x03050000 && PY_VERSION_HEX < 0x03050200
 extern "C" {
-    struct _Py_atomic_address { void *value; };
-    PyAPI_DATA(_Py_atomic_address) _PyThreadState_Current;
+struct _Py_atomic_address {
+  void *value;
+};
+PyAPI_DATA(_Py_atomic_address) _PyThreadState_Current;
 }
 #endif
 
-#define PYBIND11_TRY_NEXT_OVERLOAD ((PyObject *) 1) // special failure return code
+#define PYBIND11_TRY_NEXT_OVERLOAD                                             \
+  ((PyObject *)1) // special failure return code
 #define PYBIND11_STRINGIFY(x) #x
 #define PYBIND11_TOSTRING(x) PYBIND11_STRINGIFY(x)
 #define PYBIND11_CONCAT(first, second) first##second
 
-#define PYBIND11_CHECK_PYTHON_VERSION \
-    {                                                                          \
-        const char *compiled_ver = PYBIND11_TOSTRING(PY_MAJOR_VERSION)         \
-            "." PYBIND11_TOSTRING(PY_MINOR_VERSION);                           \
-        const char *runtime_ver = Py_GetVersion();                             \
-        size_t len = std::strlen(compiled_ver);                                \
-        if (std::strncmp(runtime_ver, compiled_ver, len) != 0                  \
-                || (runtime_ver[len] >= '0' && runtime_ver[len] <= '9')) {     \
-            PyErr_Format(PyExc_ImportError,                                    \
-                "Python version mismatch: module was compiled for Python %s, " \
-                "but the interpreter version is incompatible: %s.",            \
-                compiled_ver, runtime_ver);                                    \
-            return nullptr;                                                    \
-        }                                                                      \
-    }
+#define PYBIND11_CHECK_PYTHON_VERSION                                          \
+  {                                                                            \
+    const char *compiled_ver = PYBIND11_TOSTRING(                              \
+        PY_MAJOR_VERSION) "." PYBIND11_TOSTRING(PY_MINOR_VERSION);             \
+    const char *runtime_ver = Py_GetVersion();                                 \
+    size_t len = std::strlen(compiled_ver);                                    \
+    if (std::strncmp(runtime_ver, compiled_ver, len) != 0 ||                   \
+        (runtime_ver[len] >= '0' && runtime_ver[len] <= '9')) {                \
+      PyErr_Format(                                                            \
+          PyExc_ImportError,                                                   \
+          "Python version mismatch: module was compiled for Python %s, "       \
+          "but the interpreter version is incompatible: %s.",                  \
+          compiled_ver, runtime_ver);                                          \
+      return nullptr;                                                          \
+    }                                                                          \
+  }
 
-#define PYBIND11_CATCH_INIT_EXCEPTIONS \
-        catch (pybind11::error_already_set &e) {                               \
-            PyErr_SetString(PyExc_ImportError, e.what());                      \
-            return nullptr;                                                    \
-        } catch (const std::exception &e) {                                    \
-            PyErr_SetString(PyExc_ImportError, e.what());                      \
-            return nullptr;                                                    \
-        }                                                                      \
+#define PYBIND11_CATCH_INIT_EXCEPTIONS                                         \
+  catch (pybind11::error_already_set & e) {                                    \
+    PyErr_SetString(PyExc_ImportError, e.what());                              \
+    return nullptr;                                                            \
+  }                                                                            \
+  catch (const std::exception &e) {                                            \
+    PyErr_SetString(PyExc_ImportError, e.what());                              \
+    return nullptr;                                                            \
+  }
 
 /** \rst
     ***Deprecated in favor of PYBIND11_MODULE***
 
-    This macro creates the entry point that will be invoked when the Python interpreter
-    imports a plugin library. Please create a `module` in the function body and return
-    the pointer to its underlying Python object at the end.
+    This macro creates the entry point that will be invoked when the Python
+interpreter imports a plugin library. Please create a `module` in the function
+body and return the pointer to its underlying Python object at the end.
 
     .. code-block:: cpp
 
@@ -253,21 +264,22 @@ extern "C" {
         }
 \endrst */
 #define PYBIND11_PLUGIN(name)                                                  \
-    PYBIND11_DEPRECATED("PYBIND11_PLUGIN is deprecated, use PYBIND11_MODULE")  \
-    static PyObject *pybind11_init();                                          \
-    PYBIND11_PLUGIN_IMPL(name) {                                               \
-        PYBIND11_CHECK_PYTHON_VERSION                                          \
-        try {                                                                  \
-            return pybind11_init();                                            \
-        } PYBIND11_CATCH_INIT_EXCEPTIONS                                       \
+  PYBIND11_DEPRECATED("PYBIND11_PLUGIN is deprecated, use PYBIND11_MODULE")    \
+  static PyObject *pybind11_init();                                            \
+  PYBIND11_PLUGIN_IMPL(name) {                                                 \
+    PYBIND11_CHECK_PYTHON_VERSION                                              \
+    try {                                                                      \
+      return pybind11_init();                                                  \
     }                                                                          \
-    PyObject *pybind11_init()
+    PYBIND11_CATCH_INIT_EXCEPTIONS                                             \
+  }                                                                            \
+  PyObject *pybind11_init()
 
 /** \rst
-    This macro creates the entry point that will be invoked when the Python interpreter
-    imports an extension module. The module name is given as the fist argument and it
-    should not be in quotes. The second macro argument defines a variable of type
-    `py::module` which can be used to initialize the module.
+    This macro creates the entry point that will be invoked when the Python
+interpreter imports an extension module. The module name is given as the first
+argument and it should not be in quotes. The second macro argument defines a
+variable of type `py::module` which can be used to initialize the module.
 
     .. code-block:: cpp
 
@@ -281,92 +293,98 @@ extern "C" {
         }
 \endrst */
 #define PYBIND11_MODULE(name, variable)                                        \
-    static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &);     \
-    PYBIND11_PLUGIN_IMPL(name) {                                               \
-        PYBIND11_CHECK_PYTHON_VERSION                                          \
-        auto m = pybind11::module(PYBIND11_TOSTRING(name));                    \
-        try {                                                                  \
-            PYBIND11_CONCAT(pybind11_init_, name)(m);                          \
-            return m.ptr();                                                    \
-        } PYBIND11_CATCH_INIT_EXCEPTIONS                                       \
+  static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &);       \
+  PYBIND11_PLUGIN_IMPL(name) {                                                 \
+    PYBIND11_CHECK_PYTHON_VERSION                                              \
+    auto m = pybind11::module(PYBIND11_TOSTRING(name));                        \
+    try {                                                                      \
+      PYBIND11_CONCAT(pybind11_init_, name)(m);                                \
+      return m.ptr();                                                          \
     }                                                                          \
-    void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable)
-
+    PYBIND11_CATCH_INIT_EXCEPTIONS                                             \
+  }                                                                            \
+  void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module & variable)
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
 using ssize_t = Py_ssize_t;
-using size_t  = std::size_t;
+using size_t = std::size_t;
 
 /// Approach used to cast a previously unknown C++ instance into a Python object
 enum class return_value_policy : uint8_t {
-    /** This is the default return value policy, which falls back to the policy
-        return_value_policy::take_ownership when the return value is a pointer.
-        Otherwise, it uses return_value::move or return_value::copy for rvalue
-        and lvalue references, respectively. See below for a description of what
-        all of these different policies do. */
-    automatic = 0,
+  /** This is the default return value policy, which falls back to the policy
+      return_value_policy::take_ownership when the return value is a pointer.
+      Otherwise, it uses return_value::move or return_value::copy for rvalue
+      and lvalue references, respectively. See below for a description of what
+      all of these different policies do. */
+  automatic = 0,
 
-    /** As above, but use policy return_value_policy::reference when the return
-        value is a pointer. This is the default conversion policy for function
-        arguments when calling Python functions manually from C++ code (i.e. via
-        handle::operator()). You probably won't need to use this. */
-    automatic_reference,
+  /** As above, but use policy return_value_policy::reference when the return
+      value is a pointer. This is the default conversion policy for function
+      arguments when calling Python functions manually from C++ code (i.e. via
+      handle::operator()). You probably won't need to use this. */
+  automatic_reference,
 
-    /** Reference an existing object (i.e. do not create a new copy) and take
-        ownership. Python will call the destructor and delete operator when the
-        object’s reference count reaches zero. Undefined behavior ensues when
-        the C++ side does the same.. */
-    take_ownership,
+  /** Reference an existing object (i.e. do not create a new copy) and take
+      ownership. Python will call the destructor and delete operator when the
+      object’s reference count reaches zero. Undefined behavior ensues when
+      the C++ side does the same. */
+  take_ownership,
 
-    /** Create a new copy of the returned object, which will be owned by
-        Python. This policy is comparably safe because the lifetimes of the two
-        instances are decoupled. */
-    copy,
+  /** Create a new copy of the returned object, which will be owned by
+      Python. This policy is comparably safe because the lifetimes of the two
+      instances are decoupled. */
+  copy,
 
-    /** Use std::move to move the return value contents into a new instance
-        that will be owned by Python. This policy is comparably safe because the
-        lifetimes of the two instances (move source and destination) are
-        decoupled. */
-    move,
+  /** Use std::move to move the return value contents into a new instance
+      that will be owned by Python. This policy is comparably safe because the
+      lifetimes of the two instances (move source and destination) are
+      decoupled. */
+  move,
 
-    /** Reference an existing object, but do not take ownership. The C++ side
-        is responsible for managing the object’s lifetime and deallocating it
-        when it is no longer used. Warning: undefined behavior will ensue when
-        the C++ side deletes an object that is still referenced and used by
-        Python. */
-    reference,
+  /** Reference an existing object, but do not take ownership. The C++ side
+      is responsible for managing the object’s lifetime and deallocating it
+      when it is no longer used. Warning: undefined behavior will ensue when
+      the C++ side deletes an object that is still referenced and used by
+      Python. */
+  reference,
 
-    /** This policy only applies to methods and properties. It references the
-        object without taking ownership similar to the above
-        return_value_policy::reference policy. In contrast to that policy, the
-        function or property’s implicit this argument (called the parent) is
-        considered to be the the owner of the return value (the child).
-        pybind11 then couples the lifetime of the parent to the child via a
-        reference relationship that ensures that the parent cannot be garbage
-        collected while Python is still using the child. More advanced
-        variations of this scheme are also possible using combinations of
-        return_value_policy::reference and the keep_alive call policy */
-    reference_internal
+  /** This policy only applies to methods and properties. It references the
+      object without taking ownership similar to the above
+      return_value_policy::reference policy. In contrast to that policy, the
+      function or property’s implicit this argument (called the parent) is
+      considered to be the owner of the return value (the child).
+      pybind11 then couples the lifetime of the parent to the child via a
+      reference relationship that ensures that the parent cannot be garbage
+      collected while Python is still using the child. More advanced
+      variations of this scheme are also possible using combinations of
+      return_value_policy::reference and the keep_alive call policy */
+  reference_internal
 };
 
 NAMESPACE_BEGIN(detail)
 
-inline static constexpr int log2(size_t n, int k = 0) { return (n <= 1) ? k : log2(n >> 1, k + 1); }
+inline static constexpr int log2(size_t n, int k = 0) {
+  return (n <= 1) ? k : log2(n >> 1, k + 1);
+}
 
 // Returns the size as a multiple of sizeof(void *), rounded up.
-inline static constexpr size_t size_in_ptrs(size_t s) { return 1 + ((s - 1) >> log2(sizeof(void *))); }
+inline static constexpr size_t size_in_ptrs(size_t s) {
+  return 1 + ((s - 1) >> log2(sizeof(void *)));
+}
 
 /**
- * The space to allocate for simple layout instance holders (see below) in multiple of the size of
- * a pointer (e.g.  2 means 16 bytes on 64-bit architectures).  The default is the minimum required
- * to holder either a std::unique_ptr or std::shared_ptr (which is almost always
+ * The space to allocate for simple layout instance holders (see below) in
+ * multiple of the size of a pointer (e.g.  2 means 16 bytes on 64-bit
+ * architectures).  The default is the minimum required to hold either a
+ * std::unique_ptr or std::shared_ptr (which is almost always
  * sizeof(std::shared_ptr<T>)).
  */
 constexpr size_t instance_simple_holder_in_ptrs() {
-    static_assert(sizeof(std::shared_ptr<int>) >= sizeof(std::unique_ptr<int>),
-            "pybind assumes std::shared_ptrs are at least as big as std::unique_ptrs");
-    return size_in_ptrs(sizeof(std::shared_ptr<int>));
+  static_assert(sizeof(std::shared_ptr<int>) >= sizeof(std::unique_ptr<int>),
+                "pybind assumes std::shared_ptrs are at least as big as "
+                "std::unique_ptrs");
+  return size_in_ptrs(sizeof(std::shared_ptr<int>));
 }
 
 // Forward declarations
@@ -374,82 +392,96 @@ struct type_info;
 struct value_and_holder;
 
 struct nonsimple_values_and_holders {
-    void **values_and_holders;
-    uint8_t *status;
+  void **values_and_holders;
+  uint8_t *status;
 };
 
-/// The 'instance' type which needs to be standard layout (need to be able to use 'offsetof')
+/// The 'instance' type which needs to be standard layout (need to be able to
+/// use 'offsetof')
 struct instance {
-    PyObject_HEAD
-    /// Storage for pointers and holder; see simple_layout, below, for a description
-    union {
-        void *simple_value_holder[1 + instance_simple_holder_in_ptrs()];
-        nonsimple_values_and_holders nonsimple;
-    };
-    /// Weak references
-    PyObject *weakrefs;
-    /// If true, the pointer is owned which means we're free to manage it with a holder.
-    bool owned : 1;
-    /**
-     * An instance has two possible value/holder layouts.
-     *
-     * Simple layout (when this flag is true), means the `simple_value_holder` is set with a pointer
-     * and the holder object governing that pointer, i.e. [val1*][holder].  This layout is applied
-     * whenever there is no python-side multiple inheritance of bound C++ types *and* the type's
-     * holder will fit in the default space (which is large enough to hold either a std::unique_ptr
-     * or std::shared_ptr).
-     *
-     * Non-simple layout applies when using custom holders that require more space than `shared_ptr`
-     * (which is typically the size of two pointers), or when multiple inheritance is used on the
-     * python side.  Non-simple layout allocates the required amount of memory to have multiple
-     * bound C++ classes as parents.  Under this layout, `nonsimple.values_and_holders` is set to a
-     * pointer to allocated space of the required space to hold a sequence of value pointers and
-     * holders followed `status`, a set of bit flags (1 byte each), i.e.
-     * [val1*][holder1][val2*][holder2]...[bb...]  where each [block] is rounded up to a multiple of
-     * `sizeof(void *)`.  `nonsimple.status` is, for convenience, a pointer to the
-     * beginning of the [bb...] block (but not independently allocated).
-     *
-     * Status bits indicate whether the associated holder is constructed (&
-     * status_holder_constructed) and whether the value pointer is registered (&
-     * status_instance_registered) in `registered_instances`.
-     */
-    bool simple_layout : 1;
-    /// For simple layout, tracks whether the holder has been constructed
-    bool simple_holder_constructed : 1;
-    /// For simple layout, tracks whether the instance is registered in `registered_instances`
-    bool simple_instance_registered : 1;
-    /// If true, get_internals().patients has an entry for this object
-    bool has_patients : 1;
+  PyObject_HEAD
+      /// Storage for pointers and holder; see simple_layout, below, for a
+      /// description
+      union {
+    void *simple_value_holder[1 + instance_simple_holder_in_ptrs()];
+    nonsimple_values_and_holders nonsimple;
+  };
+  /// Weak references
+  PyObject *weakrefs;
+  /// If true, the pointer is owned which means we're free to manage it with a
+  /// holder.
+  bool owned : 1;
+  /**
+   * An instance has two possible value/holder layouts.
+   *
+   * Simple layout (when this flag is true), means the `simple_value_holder` is
+   * set with a pointer and the holder object governing that pointer, i.e.
+   * [val1*][holder].  This layout is applied whenever there is no python-side
+   * multiple inheritance of bound C++ types *and* the type's holder will fit in
+   * the default space (which is large enough to hold either a std::unique_ptr
+   * or std::shared_ptr).
+   *
+   * Non-simple layout applies when using custom holders that require more space
+   * than `shared_ptr` (which is typically the size of two pointers), or when
+   * multiple inheritance is used on the python side.  Non-simple layout
+   * allocates the required amount of memory to have multiple bound C++ classes
+   * as parents.  Under this layout, `nonsimple.values_and_holders` is set to a
+   * pointer to allocated space of the required space to hold a sequence of
+   * value pointers and holders followed by `status`, a set of bit flags (1 byte
+   * each), i.e. [val1*][holder1][val2*][holder2]...[bb...]  where each [block]
+   * is rounded up to a multiple of `sizeof(void *)`.  `nonsimple.status` is,
+   * for convenience, a pointer to the beginning of the [bb...] block (but not
+   * independently allocated).
+   *
+   * Status bits indicate whether the associated holder is constructed (&
+   * status_holder_constructed) and whether the value pointer is registered (&
+   * status_instance_registered) in `registered_instances`.
+   */
+  bool simple_layout : 1;
+  /// For simple layout, tracks whether the holder has been constructed
+  bool simple_holder_constructed : 1;
+  /// For simple layout, tracks whether the instance is registered in
+  /// `registered_instances`
+  bool simple_instance_registered : 1;
+  /// If true, get_internals().patients has an entry for this object
+  bool has_patients : 1;
 
-    /// Initializes all of the above type/values/holders data (but not the instance values themselves)
-    void allocate_layout();
+  /// Initializes all of the above type/values/holders data (but not the
+  /// instance values themselves)
+  void allocate_layout();
 
-    /// Destroys/deallocates all of the above
-    void deallocate_layout();
+  /// Destroys/deallocates all of the above
+  void deallocate_layout();
 
-    /// Returns the value_and_holder wrapper for the given type (or the first, if `find_type`
-    /// omitted).  Returns a default-constructed (with `.inst = nullptr`) object on failure if
-    /// `throw_if_missing` is false.
-    value_and_holder get_value_and_holder(const type_info *find_type = nullptr, bool throw_if_missing = true);
+  /// Returns the value_and_holder wrapper for the given type (or the first, if
+  /// `find_type` omitted).  Returns a default-constructed (with `.inst =
+  /// nullptr`) object on failure if `throw_if_missing` is false.
+  value_and_holder get_value_and_holder(const type_info *find_type = nullptr,
+                                        bool throw_if_missing = true);
 
-    /// Bit values for the non-simple status flags
-    static constexpr uint8_t status_holder_constructed  = 1;
-    static constexpr uint8_t status_instance_registered = 2;
+  /// Bit values for the non-simple status flags
+  static constexpr uint8_t status_holder_constructed = 1;
+  static constexpr uint8_t status_instance_registered = 2;
 };
 
-static_assert(std::is_standard_layout<instance>::value, "Internal error: `pybind11::detail::instance` is not standard layout!");
+static_assert(
+    std::is_standard_layout<instance>::value,
+    "Internal error: `pybind11::detail::instance` is not standard layout!");
 
 /// from __cpp_future__ import (convenient aliases from C++14/17)
 #if defined(PYBIND11_CPP14) && (!defined(_MSC_VER) || _MSC_VER >= 1910)
-using std::enable_if_t;
 using std::conditional_t;
+using std::enable_if_t;
 using std::remove_cv_t;
 using std::remove_reference_t;
 #else
-template <bool B, typename T = void> using enable_if_t = typename std::enable_if<B, T>::type;
-template <bool B, typename T, typename F> using conditional_t = typename std::conditional<B, T, F>::type;
+template <bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+template <bool B, typename T, typename F>
+using conditional_t = typename std::conditional<B, T, F>::type;
 template <typename T> using remove_cv_t = typename std::remove_cv<T>::type;
-template <typename T> using remove_reference_t = typename std::remove_reference<T>::type;
+template <typename T>
+using remove_reference_t = typename std::remove_reference<T>::type;
 #endif
 
 /// Index sequences
@@ -457,198 +489,265 @@ template <typename T> using remove_reference_t = typename std::remove_reference<
 using std::index_sequence;
 using std::make_index_sequence;
 #else
-template<size_t ...> struct index_sequence  { };
-template<size_t N, size_t ...S> struct make_index_sequence_impl : make_index_sequence_impl <N - 1, N - 1, S...> { };
-template<size_t ...S> struct make_index_sequence_impl <0, S...> { typedef index_sequence<S...> type; };
-template<size_t N> using make_index_sequence = typename make_index_sequence_impl<N>::type;
+template <size_t...> struct index_sequence {};
+template <size_t N, size_t... S>
+struct make_index_sequence_impl : make_index_sequence_impl<N - 1, N - 1, S...> {
+};
+template <size_t... S> struct make_index_sequence_impl<0, S...> {
+  typedef index_sequence<S...> type;
+};
+template <size_t N>
+using make_index_sequence = typename make_index_sequence_impl<N>::type;
 #endif
 
 /// Make an index sequence of the indices of true arguments
-template <typename ISeq, size_t, bool...> struct select_indices_impl { using type = ISeq; };
-template <size_t... IPrev, size_t I, bool B, bool... Bs> struct select_indices_impl<index_sequence<IPrev...>, I, B, Bs...>
-    : select_indices_impl<conditional_t<B, index_sequence<IPrev..., I>, index_sequence<IPrev...>>, I + 1, Bs...> {};
-template <bool... Bs> using select_indices = typename select_indices_impl<index_sequence<>, 0, Bs...>::type;
+template <typename ISeq, size_t, bool...> struct select_indices_impl {
+  using type = ISeq;
+};
+template <size_t... IPrev, size_t I, bool B, bool... Bs>
+struct select_indices_impl<index_sequence<IPrev...>, I, B, Bs...>
+    : select_indices_impl<conditional_t<B, index_sequence<IPrev..., I>,
+                                        index_sequence<IPrev...>>,
+                          I + 1, Bs...> {};
+template <bool... Bs>
+using select_indices =
+    typename select_indices_impl<index_sequence<>, 0, Bs...>::type;
 
-/// Backports of std::bool_constant and std::negation to accommodate older compilers
+/// Backports of std::bool_constant and std::negation to accommodate older
+/// compilers
 template <bool B> using bool_constant = std::integral_constant<bool, B>;
-template <typename T> struct negation : bool_constant<!T::value> { };
+template <typename T> struct negation : bool_constant<!T::value> {};
 
 template <typename...> struct void_t_impl { using type = void; };
 template <typename... Ts> using void_t = typename void_t_impl<Ts...>::type;
 
-/// Compile-time all/any/none of that check the boolean value of all template types
+/// Compile-time all/any/none of that check the boolean value of all template
+/// types
 #if defined(__cpp_fold_expressions) && !(defined(_MSC_VER) && (_MSC_VER < 1916))
 template <class... Ts> using all_of = bool_constant<(Ts::value && ...)>;
 template <class... Ts> using any_of = bool_constant<(Ts::value || ...)>;
 #elif !defined(_MSC_VER)
 template <bool...> struct bools {};
-template <class... Ts> using all_of = std::is_same<
-    bools<Ts::value..., true>,
-    bools<true, Ts::value...>>;
+template <class... Ts>
+using all_of =
+    std::is_same<bools<Ts::value..., true>, bools<true, Ts::value...>>;
 template <class... Ts> using any_of = negation<all_of<negation<Ts>...>>;
 #else
-// MSVC has trouble with the above, but supports std::conjunction, which we can use instead (albeit
-// at a slight loss of compilation efficiency).
+// MSVC has trouble with the above, but supports std::conjunction, which we can
+// use instead (albeit at a slight loss of compilation efficiency).
 template <class... Ts> using all_of = std::conjunction<Ts...>;
 template <class... Ts> using any_of = std::disjunction<Ts...>;
 #endif
 template <class... Ts> using none_of = negation<any_of<Ts...>>;
 
-template <class T, template<class> class... Predicates> using satisfies_all_of = all_of<Predicates<T>...>;
-template <class T, template<class> class... Predicates> using satisfies_any_of = any_of<Predicates<T>...>;
-template <class T, template<class> class... Predicates> using satisfies_none_of = none_of<Predicates<T>...>;
+template <class T, template <class> class... Predicates>
+using satisfies_all_of = all_of<Predicates<T>...>;
+template <class T, template <class> class... Predicates>
+using satisfies_any_of = any_of<Predicates<T>...>;
+template <class T, template <class> class... Predicates>
+using satisfies_none_of = none_of<Predicates<T>...>;
 
 /// Strip the class from a method type
-template <typename T> struct remove_class { };
-template <typename C, typename R, typename... A> struct remove_class<R (C::*)(A...)> { typedef R type(A...); };
-template <typename C, typename R, typename... A> struct remove_class<R (C::*)(A...) const> { typedef R type(A...); };
+template <typename T> struct remove_class {};
+template <typename C, typename R, typename... A>
+struct remove_class<R (C::*)(A...)> {
+  typedef R type(A...);
+};
+template <typename C, typename R, typename... A>
+struct remove_class<R (C::*)(A...) const> {
+  typedef R type(A...);
+};
 
 /// Helper template to strip away type modifiers
-template <typename T> struct intrinsic_type                       { typedef T type; };
-template <typename T> struct intrinsic_type<const T>              { typedef typename intrinsic_type<T>::type type; };
-template <typename T> struct intrinsic_type<T*>                   { typedef typename intrinsic_type<T>::type type; };
-template <typename T> struct intrinsic_type<T&>                   { typedef typename intrinsic_type<T>::type type; };
-template <typename T> struct intrinsic_type<T&&>                  { typedef typename intrinsic_type<T>::type type; };
-template <typename T, size_t N> struct intrinsic_type<const T[N]> { typedef typename intrinsic_type<T>::type type; };
-template <typename T, size_t N> struct intrinsic_type<T[N]>       { typedef typename intrinsic_type<T>::type type; };
+template <typename T> struct intrinsic_type { typedef T type; };
+template <typename T> struct intrinsic_type<const T> {
+  typedef typename intrinsic_type<T>::type type;
+};
+template <typename T> struct intrinsic_type<T *> {
+  typedef typename intrinsic_type<T>::type type;
+};
+template <typename T> struct intrinsic_type<T &> {
+  typedef typename intrinsic_type<T>::type type;
+};
+template <typename T> struct intrinsic_type<T &&> {
+  typedef typename intrinsic_type<T>::type type;
+};
+template <typename T, size_t N> struct intrinsic_type<const T[N]> {
+  typedef typename intrinsic_type<T>::type type;
+};
+template <typename T, size_t N> struct intrinsic_type<T[N]> {
+  typedef typename intrinsic_type<T>::type type;
+};
 template <typename T> using intrinsic_t = typename intrinsic_type<T>::type;
 
 /// Helper type to replace 'void' in some expressions
-struct void_type { };
+struct void_type {};
 
 /// Helper template which holds a list of types
-template <typename...> struct type_list { };
+template <typename...> struct type_list {};
 
 /// Compile-time integer sum
 #ifdef __cpp_fold_expressions
-template <typename... Ts> constexpr size_t constexpr_sum(Ts... ns) { return (0 + ... + size_t{ns}); }
+template <typename... Ts> constexpr size_t constexpr_sum(Ts... ns) {
+  return (0 + ... + size_t{ns});
+}
 #else
 constexpr size_t constexpr_sum() { return 0; }
 template <typename T, typename... Ts>
-constexpr size_t constexpr_sum(T n, Ts... ns) { return size_t{n} + constexpr_sum(ns...); }
+constexpr size_t constexpr_sum(T n, Ts... ns) {
+  return size_t{n} + constexpr_sum(ns...);
+}
 #endif
 
 NAMESPACE_BEGIN(constexpr_impl)
 /// Implementation details for constexpr functions
 constexpr int first(int i) { return i; }
 template <typename T, typename... Ts>
-constexpr int first(int i, T v, Ts... vs) { return v ? i : first(i + 1, vs...); }
+constexpr int first(int i, T v, Ts... vs) {
+  return v ? i : first(i + 1, vs...);
+}
 
 constexpr int last(int /*i*/, int result) { return result; }
 template <typename T, typename... Ts>
-constexpr int last(int i, int result, T v, Ts... vs) { return last(i + 1, v ? i : result, vs...); }
+constexpr int last(int i, int result, T v, Ts... vs) {
+  return last(i + 1, v ? i : result, vs...);
+}
 NAMESPACE_END(constexpr_impl)
 
-/// Return the index of the first type in Ts which satisfies Predicate<T>.  Returns sizeof...(Ts) if
-/// none match.
-template <template<typename> class Predicate, typename... Ts>
-constexpr int constexpr_first() { return constexpr_impl::first(0, Predicate<Ts>::value...); }
+/// Return the index of the first type in Ts which satisfies Predicate<T>.
+/// Returns sizeof...(Ts) if none match.
+template <template <typename> class Predicate, typename... Ts>
+constexpr int constexpr_first() {
+  return constexpr_impl::first(0, Predicate<Ts>::value...);
+}
 
-/// Return the index of the last type in Ts which satisfies Predicate<T>, or -1 if none match.
-template <template<typename> class Predicate, typename... Ts>
-constexpr int constexpr_last() { return constexpr_impl::last(0, -1, Predicate<Ts>::value...); }
+/// Return the index of the last type in Ts which satisfies Predicate<T>, or -1
+/// if none match.
+template <template <typename> class Predicate, typename... Ts>
+constexpr int constexpr_last() {
+  return constexpr_impl::last(0, -1, Predicate<Ts>::value...);
+}
 
 /// Return the Nth element from the parameter pack
-template <size_t N, typename T, typename... Ts>
-struct pack_element { using type = typename pack_element<N - 1, Ts...>::type; };
-template <typename T, typename... Ts>
-struct pack_element<0, T, Ts...> { using type = T; };
-
-/// Return the one and only type which matches the predicate, or Default if none match.
-/// If more than one type matches the predicate, fail at compile-time.
-template <template<typename> class Predicate, typename Default, typename... Ts>
-struct exactly_one {
-    static constexpr auto found = constexpr_sum(Predicate<Ts>::value...);
-    static_assert(found <= 1, "Found more than one type matching the predicate");
-
-    static constexpr auto index = found ? constexpr_first<Predicate, Ts...>() : 0;
-    using type = conditional_t<found, typename pack_element<index, Ts...>::type, Default>;
+template <size_t N, typename T, typename... Ts> struct pack_element {
+  using type = typename pack_element<N - 1, Ts...>::type;
+};
+template <typename T, typename... Ts> struct pack_element<0, T, Ts...> {
+  using type = T;
 };
-template <template<typename> class P, typename Default>
-struct exactly_one<P, Default> { using type = Default; };
 
-template <template<typename> class Predicate, typename Default, typename... Ts>
+/// Return the one and only type which matches the predicate, or Default if none
+/// match. If more than one type matches the predicate, fail at compile-time.
+template <template <typename> class Predicate, typename Default, typename... Ts>
+struct exactly_one {
+  static constexpr auto found = constexpr_sum(Predicate<Ts>::value...);
+  static_assert(found <= 1, "Found more than one type matching the predicate");
+
+  static constexpr auto index = found ? constexpr_first<Predicate, Ts...>() : 0;
+  using type =
+      conditional_t<found, typename pack_element<index, Ts...>::type, Default>;
+};
+template <template <typename> class P, typename Default>
+struct exactly_one<P, Default> {
+  using type = Default;
+};
+
+template <template <typename> class Predicate, typename Default, typename... Ts>
 using exactly_one_t = typename exactly_one<Predicate, Default, Ts...>::type;
 
 /// Defer the evaluation of type T until types Us are instantiated
-template <typename T, typename... /*Us*/> struct deferred_type { using type = T; };
-template <typename T, typename... Us> using deferred_t = typename deferred_type<T, Us...>::type;
+template <typename T, typename... /*Us*/> struct deferred_type {
+  using type = T;
+};
+template <typename T, typename... Us>
+using deferred_t = typename deferred_type<T, Us...>::type;
 
-/// Like is_base_of, but requires a strict base (i.e. `is_strict_base_of<T, T>::value == false`,
-/// unlike `std::is_base_of`)
-template <typename Base, typename Derived> using is_strict_base_of = bool_constant<
-    std::is_base_of<Base, Derived>::value && !std::is_same<Base, Derived>::value>;
+/// Like is_base_of, but requires a strict base (i.e. `is_strict_base_of<T,
+/// T>::value == false`, unlike `std::is_base_of`)
+template <typename Base, typename Derived>
+using is_strict_base_of = bool_constant<std::is_base_of<Base, Derived>::value &&
+                                        !std::is_same<Base, Derived>::value>;
 
-/// Like is_base_of, but also requires that the base type is accessible (i.e. that a Derived pointer
-/// can be converted to a Base pointer)
-template <typename Base, typename Derived> using is_accessible_base_of = bool_constant<
-    std::is_base_of<Base, Derived>::value && std::is_convertible<Derived *, Base *>::value>;
+/// Like is_base_of, but also requires that the base type is accessible (i.e.
+/// that a Derived pointer can be converted to a Base pointer)
+template <typename Base, typename Derived>
+using is_accessible_base_of =
+    bool_constant<std::is_base_of<Base, Derived>::value &&
+                  std::is_convertible<Derived *, Base *>::value>;
 
-template <template<typename...> class Base>
-struct is_template_base_of_impl {
-    template <typename... Us> static std::true_type check(Base<Us...> *);
-    static std::false_type check(...);
+template <template <typename...> class Base> struct is_template_base_of_impl {
+  template <typename... Us> static std::true_type check(Base<Us...> *);
+  static std::false_type check(...);
 };
 
 /// Check if a template is the base of a type. For example:
-/// `is_template_base_of<Base, T>` is true if `struct T : Base<U> {}` where U can be anything
-template <template<typename...> class Base, typename T>
+/// `is_template_base_of<Base, T>` is true if `struct T : Base<U> {}` where U
+/// can be anything
+template <template <typename...> class Base, typename T>
 #if !defined(_MSC_VER)
-using is_template_base_of = decltype(is_template_base_of_impl<Base>::check((intrinsic_t<T>*)nullptr));
+using is_template_base_of =
+    decltype(is_template_base_of_impl<Base>::check((intrinsic_t<T> *)nullptr));
 #else // MSVC2015 has trouble with decltype in template aliases
-struct is_template_base_of : decltype(is_template_base_of_impl<Base>::check((intrinsic_t<T>*)nullptr)) { };
+struct is_template_base_of : decltype(is_template_base_of_impl<Base>::check(
+                                 (intrinsic_t<T> *)nullptr)) {
+};
 #endif
 
 /// Check if T is an instantiation of the template `Class`. For example:
-/// `is_instantiation<shared_ptr, T>` is true if `T == shared_ptr<U>` where U can be anything.
-template <template<typename...> class Class, typename T>
-struct is_instantiation : std::false_type { };
-template <template<typename...> class Class, typename... Us>
-struct is_instantiation<Class, Class<Us...>> : std::true_type { };
+/// `is_instantiation<shared_ptr, T>` is true if `T == shared_ptr<U>` where U
+/// can be anything.
+template <template <typename...> class Class, typename T>
+struct is_instantiation : std::false_type {};
+template <template <typename...> class Class, typename... Us>
+struct is_instantiation<Class, Class<Us...>> : std::true_type {};
 
 /// Check if T is std::shared_ptr<U> where U can be anything
-template <typename T> using is_shared_ptr = is_instantiation<std::shared_ptr, T>;
+template <typename T>
+using is_shared_ptr = is_instantiation<std::shared_ptr, T>;
 
 /// Check if T looks like an input iterator
-template <typename T, typename = void> struct is_input_iterator : std::false_type {};
+template <typename T, typename = void>
+struct is_input_iterator : std::false_type {};
 template <typename T>
-struct is_input_iterator<T, void_t<decltype(*std::declval<T &>()), decltype(++std::declval<T &>())>>
+struct is_input_iterator<
+    T, void_t<decltype(*std::declval<T &>()), decltype(++std::declval<T &>())>>
     : std::true_type {};
 
-template <typename T> using is_function_pointer = bool_constant<
-    std::is_pointer<T>::value && std::is_function<typename std::remove_pointer<T>::type>::value>;
+template <typename T>
+using is_function_pointer = bool_constant<
+    std::is_pointer<T>::value &&
+    std::is_function<typename std::remove_pointer<T>::type>::value>;
 
 template <typename F> struct strip_function_object {
-    using type = typename remove_class<decltype(&F::operator())>::type;
+  using type = typename remove_class<decltype(&F::operator())>::type;
 };
 
 // Extracts the function signature from a function, function pointer or lambda.
 template <typename Function, typename F = remove_reference_t<Function>>
 using function_signature_t = conditional_t<
-    std::is_function<F>::value,
-    F,
+    std::is_function<F>::value, F,
     typename conditional_t<
         std::is_pointer<F>::value || std::is_member_pointer<F>::value,
-        std::remove_pointer<F>,
-        strip_function_object<F>
-    >::type
->;
+        std::remove_pointer<F>, strip_function_object<F>>::type>;
 
-/// Returns true if the type looks like a lambda: that is, isn't a function, pointer or member
-/// pointer.  Note that this can catch all sorts of other things, too; this is intended to be used
-/// in a place where passing a lambda makes sense.
-template <typename T> using is_lambda = satisfies_none_of<remove_reference_t<T>,
-        std::is_function, std::is_pointer, std::is_member_pointer>;
+/// Returns true if the type looks like a lambda: that is, isn't a function,
+/// pointer or member pointer.  Note that this can catch all sorts of other
+/// things, too; this is intended to be used in a place where passing a lambda
+/// makes sense.
+template <typename T>
+using is_lambda = satisfies_none_of<remove_reference_t<T>, std::is_function,
+                                    std::is_pointer, std::is_member_pointer>;
 
 /// Ignore that a variable is unused in compiler warnings
-inline void ignore_unused(const int *) { }
+inline void ignore_unused(const int *) {}
 
 /// Apply a function over each element of a parameter pack
 #ifdef __cpp_fold_expressions
 #define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) (((PATTERN), void()), ...)
 #else
 using expand_side_effects = bool[];
-#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) pybind11::detail::expand_side_effects{ ((PATTERN), void(), false)..., false }
+#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN)                                  \
+  pybind11::detail::expand_side_effects { ((PATTERN), void(), false)..., false }
 #endif
 
 NAMESPACE_END(detail)
@@ -656,89 +755,120 @@ NAMESPACE_END(detail)
 /// C++ bindings of builtin Python exceptions
 class builtin_exception : public std::runtime_error {
 public:
-    using std::runtime_error::runtime_error;
-    /// Set the error using the Python C API
-    virtual void set_error() const = 0;
+  using std::runtime_error::runtime_error;
+  /// Set the error using the Python C API
+  virtual void set_error() const = 0;
 };
 
-#define PYBIND11_RUNTIME_EXCEPTION(name, type) \
-    class name : public builtin_exception { public: \
-        using builtin_exception::builtin_exception; \
-        name() : name("") { } \
-        void set_error() const override { PyErr_SetString(type, what()); } \
-    };
+#define PYBIND11_RUNTIME_EXCEPTION(name, type)                                 \
+  class name : public builtin_exception {                                      \
+  public:                                                                      \
+    using builtin_exception::builtin_exception;                                \
+    name() : name("") {}                                                       \
+    void set_error() const override { PyErr_SetString(type, what()); }         \
+  };
 
 PYBIND11_RUNTIME_EXCEPTION(stop_iteration, PyExc_StopIteration)
 PYBIND11_RUNTIME_EXCEPTION(index_error, PyExc_IndexError)
 PYBIND11_RUNTIME_EXCEPTION(key_error, PyExc_KeyError)
 PYBIND11_RUNTIME_EXCEPTION(value_error, PyExc_ValueError)
 PYBIND11_RUNTIME_EXCEPTION(type_error, PyExc_TypeError)
-PYBIND11_RUNTIME_EXCEPTION(cast_error, PyExc_RuntimeError) /// Thrown when pybind11::cast or handle::call fail due to a type casting error
-PYBIND11_RUNTIME_EXCEPTION(reference_cast_error, PyExc_RuntimeError) /// Used internally
+PYBIND11_RUNTIME_EXCEPTION(
+    cast_error,
+    PyExc_RuntimeError) /// Thrown when pybind11::cast or handle::call fail due
+                        /// to a type casting error
+PYBIND11_RUNTIME_EXCEPTION(reference_cast_error,
+                           PyExc_RuntimeError) /// Used internally
 
-[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const char *reason) { throw std::runtime_error(reason); }
-[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const std::string &reason) { throw std::runtime_error(reason); }
+[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const char *reason) {
+  throw std::runtime_error(reason);
+}
+[[noreturn]] PYBIND11_NOINLINE inline void
+pybind11_fail(const std::string &reason) {
+  throw std::runtime_error(reason);
+}
 
-template <typename T, typename SFINAE = void> struct format_descriptor { };
+template <typename T, typename SFINAE = void> struct format_descriptor {};
 
 NAMESPACE_BEGIN(detail)
-// Returns the index of the given type in the type char array below, and in the list in numpy.h
-// The order here is: bool; 8 ints ((signed,unsigned)x(8,16,32,64)bits); float,double,long double;
-// complex float,double,long double.  Note that the long double types only participate when long
-// double is actually longer than double (it isn't under MSVC).
-// NB: not only the string below but also complex.h and numpy.h rely on this order.
-template <typename T, typename SFINAE = void> struct is_fmt_numeric { static constexpr bool value = false; };
-template <typename T> struct is_fmt_numeric<T, enable_if_t<std::is_arithmetic<T>::value>> {
-    static constexpr bool value = true;
-    static constexpr int index = std::is_same<T, bool>::value ? 0 : 1 + (
-        std::is_integral<T>::value ? detail::log2(sizeof(T))*2 + std::is_unsigned<T>::value : 8 + (
-        std::is_same<T, double>::value ? 1 : std::is_same<T, long double>::value ? 2 : 0));
+// Returns the index of the given type in the type char array below, and in the
+// list in numpy.h. The order here is: bool; 8 ints
+// ((signed,unsigned)x(8,16,32,64)bits); float,double,long double; complex
+// float,double,long double.  Note that the long double types only participate
+// when long double is actually longer than double (it isn't under MSVC). NB:
+// not only the string below but also complex.h and numpy.h rely on this order.
+template <typename T, typename SFINAE = void> struct is_fmt_numeric {
+  static constexpr bool value = false;
+};
+template <typename T>
+struct is_fmt_numeric<T, enable_if_t<std::is_arithmetic<T>::value>> {
+  static constexpr bool value = true;
+  static constexpr int index =
+      std::is_same<T, bool>::value
+          ? 0
+          : 1 + (std::is_integral<T>::value
+                     ? detail::log2(sizeof(T)) * 2 + std::is_unsigned<T>::value
+                     : 8 + (std::is_same<T, double>::value
+                                ? 1
+                                : std::is_same<T, long double>::value ? 2 : 0));
 };
 NAMESPACE_END(detail)
 
-template <typename T> struct format_descriptor<T, detail::enable_if_t<std::is_arithmetic<T>::value>> {
-    static constexpr const char c = "?bBhHiIqQfdg"[detail::is_fmt_numeric<T>::index];
-    static constexpr const char value[2] = { c, '\0' };
-    static std::string format() { return std::string(1, c); }
+template <typename T>
+struct format_descriptor<T, detail::enable_if_t<std::is_arithmetic<T>::value>> {
+  static constexpr const char c =
+      "?bBhHiIqQfdg"[detail::is_fmt_numeric<T>::index];
+  static constexpr const char value[2] = {c, '\0'};
+  static std::string format() { return std::string(1, c); }
 };
 
 #if !defined(PYBIND11_CPP17)
 
-template <typename T> constexpr const char format_descriptor<
+template <typename T>
+constexpr const char format_descriptor<
     T, detail::enable_if_t<std::is_arithmetic<T>::value>>::value[2];
 
 #endif
 
 /// RAII wrapper that temporarily clears any Python error state
 struct error_scope {
-    PyObject *type, *value, *trace;
-    error_scope() { PyErr_Fetch(&type, &value, &trace); }
-    ~error_scope() { PyErr_Restore(type, value, trace); }
+  PyObject *type, *value, *trace;
+  error_scope() { PyErr_Fetch(&type, &value, &trace); }
+  ~error_scope() { PyErr_Restore(type, value, trace); }
 };
 
-/// Dummy destructor wrapper that can be used to expose classes with a private destructor
-struct nodelete { template <typename T> void operator()(T*) { } };
+/// Dummy destructor wrapper that can be used to expose classes with a private
+/// destructor
+struct nodelete {
+  template <typename T> void operator()(T *) {}
+};
 
 // overload_cast requires variable templates: C++14
 #if defined(PYBIND11_CPP14)
 #define PYBIND11_OVERLOAD_CAST 1
 
 NAMESPACE_BEGIN(detail)
-template <typename... Args>
-struct overload_cast_impl {
-    constexpr overload_cast_impl() {} // MSVC 2015 needs this
+template <typename... Args> struct overload_cast_impl {
+  constexpr overload_cast_impl() {} // MSVC 2015 needs this
 
-    template <typename Return>
-    constexpr auto operator()(Return (*pf)(Args...)) const noexcept
-                              -> decltype(pf) { return pf; }
+  template <typename Return>
+  constexpr auto operator()(Return (*pf)(Args...)) const noexcept
+      -> decltype(pf) {
+    return pf;
+  }
 
-    template <typename Return, typename Class>
-    constexpr auto operator()(Return (Class::*pmf)(Args...), std::false_type = {}) const noexcept
-                              -> decltype(pmf) { return pmf; }
+  template <typename Return, typename Class>
+  constexpr auto operator()(Return (Class::*pmf)(Args...),
+                            std::false_type = {}) const noexcept
+      -> decltype(pmf) {
+    return pmf;
+  }
 
-    template <typename Return, typename Class>
-    constexpr auto operator()(Return (Class::*pmf)(Args...) const, std::true_type) const noexcept
-                              -> decltype(pmf) { return pmf; }
+  template <typename Return, typename Class>
+  constexpr auto operator()(Return (Class::*pmf)(Args...) const,
+                            std::true_type) const noexcept -> decltype(pmf) {
+    return pmf;
+  }
 };
 NAMESPACE_END(detail)
 
@@ -747,61 +877,69 @@ NAMESPACE_END(detail)
 ///  - sweet:   overload_cast<Arg0, Arg1, Arg2>(&Class::func)
 template <typename... Args>
 static constexpr detail::overload_cast_impl<Args...> overload_cast = {};
-// MSVC 2015 only accepts this particular initialization syntax for this variable template.
+// MSVC 2015 only accepts this particular initialization syntax for this
+// variable template.
 
 /// Const member function selector for overload_cast
 ///  - regular: static_cast<Return (Class::*)(Arg) const>(&Class::func)
 ///  - sweet:   overload_cast<Arg>(&Class::func, const_)
 static constexpr auto const_ = std::true_type{};
 
-#else // no overload_cast: providing something that static_assert-fails:
+#else  // no overload_cast: providing something that static_assert-fails:
 template <typename... Args> struct overload_cast {
-    static_assert(detail::deferred_t<std::false_type, Args...>::value,
-                  "pybind11::overload_cast<...> requires compiling in C++14 mode");
+  static_assert(
+      detail::deferred_t<std::false_type, Args...>::value,
+      "pybind11::overload_cast<...> requires compiling in C++14 mode");
 };
 #endif // overload_cast
 
 NAMESPACE_BEGIN(detail)
 
-// Adaptor for converting arbitrary container arguments into a vector; implicitly convertible from
-// any standard container (or C-style array) supporting std::begin/std::end, any singleton
-// arithmetic type (if T is arithmetic), or explicitly constructible from an iterator pair.
-template <typename T>
-class any_container {
-    std::vector<T> v;
+// Adaptor for converting arbitrary container arguments into a vector;
+// implicitly convertible from any standard container (or C-style array)
+// supporting std::begin/std::end, any singleton arithmetic type (if T is
+// arithmetic), or explicitly constructible from an iterator pair.
+template <typename T> class any_container {
+  std::vector<T> v;
+
 public:
-    any_container() = default;
+  any_container() = default;
 
-    // Can construct from a pair of iterators
-    template <typename It, typename = enable_if_t<is_input_iterator<It>::value>>
-    any_container(It first, It last) : v(first, last) { }
+  // Can construct from a pair of iterators
+  template <typename It, typename = enable_if_t<is_input_iterator<It>::value>>
+  any_container(It first, It last) : v(first, last) {}
 
-    // Implicit conversion constructor from any arbitrary container type with values convertible to T
-    template <typename Container, typename = enable_if_t<std::is_convertible<decltype(*std::begin(std::declval<const Container &>())), T>::value>>
-    any_container(const Container &c) : any_container(std::begin(c), std::end(c)) { }
+  // Implicit conversion constructor from any arbitrary container type with
+  // values convertible to T
+  template <
+      typename Container,
+      typename = enable_if_t<std::is_convertible<
+          decltype(*std::begin(std::declval<const Container &>())), T>::value>>
+  any_container(const Container &c)
+      : any_container(std::begin(c), std::end(c)) {}
 
-    // initializer_list's aren't deducible, so don't get matched by the above template; we need this
-    // to explicitly allow implicit conversion from one:
-    template <typename TIn, typename = enable_if_t<std::is_convertible<TIn, T>::value>>
-    any_container(const std::initializer_list<TIn> &c) : any_container(c.begin(), c.end()) { }
+  // initializer_list's aren't deducible, so don't get matched by the above
+  // template; we need this to explicitly allow implicit conversion from one:
+  template <typename TIn,
+            typename = enable_if_t<std::is_convertible<TIn, T>::value>>
+  any_container(const std::initializer_list<TIn> &c)
+      : any_container(c.begin(), c.end()) {}
 
-    // Avoid copying if given an rvalue vector of the correct type.
-    any_container(std::vector<T> &&v) : v(std::move(v)) { }
+  // Avoid copying if given an rvalue vector of the correct type.
+  any_container(std::vector<T> &&v) : v(std::move(v)) {}
 
-    // Moves the vector out of an rvalue any_container
-    operator std::vector<T> &&() && { return std::move(v); }
+  // Moves the vector out of an rvalue any_container
+  operator std::vector<T> &&() && { return std::move(v); }
 
-    // Dereferencing obtains a reference to the underlying vector
-    std::vector<T> &operator*() { return v; }
-    const std::vector<T> &operator*() const { return v; }
+  // Dereferencing obtains a reference to the underlying vector
+  std::vector<T> &operator*() { return v; }
+  const std::vector<T> &operator*() const { return v; }
 
-    // -> lets you call methods on the underlying vector
-    std::vector<T> *operator->() { return &v; }
-    const std::vector<T> *operator->() const { return &v; }
+  // -> lets you call methods on the underlying vector
+  std::vector<T> *operator->() { return &v; }
+  const std::vector<T> *operator->() const { return &v; }
 };
 
 NAMESPACE_END(detail)
 
-
-
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/detail/descr.h b/python/src/pybind11/detail/descr.h
index 8d404e534..38f8b7433 100644
--- a/python/src/pybind11/detail/descr.h
+++ b/python/src/pybind11/detail/descr.h
@@ -1,5 +1,6 @@
 /*
-    pybind11/detail/descr.h: Helper type for concatenating type signatures at compile time
+    pybind11/detail/descr.h: Helper type for concatenating type signatures at
+   compile time
 
     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
 
@@ -15,67 +16,82 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
 #if !defined(_MSC_VER)
-#  define PYBIND11_DESCR_CONSTEXPR static constexpr
+#define PYBIND11_DESCR_CONSTEXPR static constexpr
 #else
-#  define PYBIND11_DESCR_CONSTEXPR const
+#define PYBIND11_DESCR_CONSTEXPR const
 #endif
 
 /* Concatenate type signatures at compile time */
-template <size_t N, typename... Ts>
-struct descr {
-    char text[N + 1];
+template <size_t N, typename... Ts> struct descr {
+  char text[N + 1];
 
-    constexpr descr() : text{'\0'} { }
-    constexpr descr(char const (&s)[N+1]) : descr(s, make_index_sequence<N>()) { }
+  constexpr descr() : text{'\0'} {}
+  constexpr descr(char const (&s)[N + 1])
+      : descr(s, make_index_sequence<N>()) {}
 
-    template <size_t... Is>
-    constexpr descr(char const (&s)[N+1], index_sequence<Is...>) : text{s[Is]..., '\0'} { }
+  template <size_t... Is>
+  constexpr descr(char const (&s)[N + 1], index_sequence<Is...>)
+      : text{s[Is]..., '\0'} {}
 
-    template <typename... Chars>
-    constexpr descr(char c, Chars... cs) : text{c, static_cast<char>(cs)..., '\0'} { }
+  template <typename... Chars>
+  constexpr descr(char c, Chars... cs)
+      : text{c, static_cast<char>(cs)..., '\0'} {}
 
-    static constexpr std::array<const std::type_info *, sizeof...(Ts) + 1> types() {
-        return {{&typeid(Ts)..., nullptr}};
-    }
+  static constexpr std::array<const std::type_info *, sizeof...(Ts) + 1>
+  types() {
+    return {{&typeid(Ts)..., nullptr}};
+  }
 };
 
-template <size_t N1, size_t N2, typename... Ts1, typename... Ts2, size_t... Is1, size_t... Is2>
-constexpr descr<N1 + N2, Ts1..., Ts2...> plus_impl(const descr<N1, Ts1...> &a, const descr<N2, Ts2...> &b,
-                                                   index_sequence<Is1...>, index_sequence<Is2...>) {
-    return {a.text[Is1]..., b.text[Is2]...};
+template <size_t N1, size_t N2, typename... Ts1, typename... Ts2, size_t... Is1,
+          size_t... Is2>
+constexpr descr<N1 + N2, Ts1..., Ts2...>
+plus_impl(const descr<N1, Ts1...> &a, const descr<N2, Ts2...> &b,
+          index_sequence<Is1...>, index_sequence<Is2...>) {
+  return {a.text[Is1]..., b.text[Is2]...};
 }
 
 template <size_t N1, size_t N2, typename... Ts1, typename... Ts2>
-constexpr descr<N1 + N2, Ts1..., Ts2...> operator+(const descr<N1, Ts1...> &a, const descr<N2, Ts2...> &b) {
-    return plus_impl(a, b, make_index_sequence<N1>(), make_index_sequence<N2>());
+constexpr descr<N1 + N2, Ts1..., Ts2...> operator+(const descr<N1, Ts1...> &a,
+                                                   const descr<N2, Ts2...> &b) {
+  return plus_impl(a, b, make_index_sequence<N1>(), make_index_sequence<N2>());
 }
 
-template <size_t N>
-constexpr descr<N - 1> _(char const(&text)[N]) { return descr<N - 1>(text); }
-constexpr descr<0> _(char const(&)[1]) { return {}; }
+template <size_t N> constexpr descr<N - 1> _(char const (&text)[N]) {
+  return descr<N - 1>(text);
+}
+constexpr descr<0> _(char const (&)[1]) { return {}; }
 
-template <size_t Rem, size_t... Digits> struct int_to_str : int_to_str<Rem/10, Rem%10, Digits...> { };
-template <size_t...Digits> struct int_to_str<0, Digits...> {
-    static constexpr auto digits = descr<sizeof...(Digits)>(('0' + Digits)...);
+template <size_t Rem, size_t... Digits>
+struct int_to_str : int_to_str<Rem / 10, Rem % 10, Digits...> {};
+template <size_t... Digits> struct int_to_str<0, Digits...> {
+  static constexpr auto digits = descr<sizeof...(Digits)>(('0' + Digits)...);
 };
 
 // Ternary description (like std::conditional)
 template <bool B, size_t N1, size_t N2>
-constexpr enable_if_t<B, descr<N1 - 1>> _(char const(&text1)[N1], char const(&)[N2]) {
-    return _(text1);
+constexpr enable_if_t<B, descr<N1 - 1>> _(char const (&text1)[N1],
+                                          char const (&)[N2]) {
+  return _(text1);
 }
 template <bool B, size_t N1, size_t N2>
-constexpr enable_if_t<!B, descr<N2 - 1>> _(char const(&)[N1], char const(&text2)[N2]) {
-    return _(text2);
+constexpr enable_if_t<!B, descr<N2 - 1>> _(char const (&)[N1],
+                                           char const (&text2)[N2]) {
+  return _(text2);
 }
 
 template <bool B, typename T1, typename T2>
-constexpr enable_if_t<B, T1> _(const T1 &d, const T2 &) { return d; }
+constexpr enable_if_t<B, T1> _(const T1 &d, const T2 &) {
+  return d;
+}
 template <bool B, typename T1, typename T2>
-constexpr enable_if_t<!B, T2> _(const T1 &, const T2 &d) { return d; }
+constexpr enable_if_t<!B, T2> _(const T1 &, const T2 &d) {
+  return d;
+}
 
-template <size_t Size> auto constexpr _() -> decltype(int_to_str<Size / 10, Size % 10>::digits) {
-    return int_to_str<Size / 10, Size % 10>::digits;
+template <size_t Size>
+auto constexpr _() -> decltype(int_to_str<Size / 10, Size % 10>::digits) {
+  return int_to_str<Size / 10, Size % 10>::digits;
 }
 
 template <typename Type> constexpr descr<1, Type> _() { return {'%'}; }
@@ -83,17 +99,19 @@ template <typename Type> constexpr descr<1, Type> _() { return {'%'}; }
 constexpr descr<0> concat() { return {}; }
 
 template <size_t N, typename... Ts>
-constexpr descr<N, Ts...> concat(const descr<N, Ts...> &descr) { return descr; }
+constexpr descr<N, Ts...> concat(const descr<N, Ts...> &descr) {
+  return descr;
+}
 
 template <size_t N, typename... Ts, typename... Args>
-constexpr auto concat(const descr<N, Ts...> &d, const Args &...args)
+constexpr auto concat(const descr<N, Ts...> &d, const Args &... args)
     -> decltype(std::declval<descr<N + 2, Ts...>>() + concat(args...)) {
-    return d + _(", ") + concat(args...);
+  return d + _(", ") + concat(args...);
 }
 
 template <size_t N, typename... Ts>
 constexpr descr<N + 2, Ts...> type_descr(const descr<N, Ts...> &descr) {
-    return _("{") + descr + _("}");
+  return _("{") + descr + _("}");
 }
 
 NAMESPACE_END(detail)
diff --git a/python/src/pybind11/detail/init.h b/python/src/pybind11/detail/init.h
index acfe00bdb..a48dfd419 100644
--- a/python/src/pybind11/detail/init.h
+++ b/python/src/pybind11/detail/init.h
@@ -1,5 +1,6 @@
 /*
-    pybind11/detail/init.h: init factory function implementation and support code.
+    pybind11/detail/init.h: init factory function implementation and support
+   code.
 
     Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
 
@@ -14,26 +15,26 @@
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
-template <>
-class type_caster<value_and_holder> {
+template <> class type_caster<value_and_holder> {
 public:
-    bool load(handle h, bool) {
-        value = reinterpret_cast<value_and_holder *>(h.ptr());
-        return true;
-    }
+  bool load(handle h, bool) {
+    value = reinterpret_cast<value_and_holder *>(h.ptr());
+    return true;
+  }
 
-    template <typename> using cast_op_type = value_and_holder &;
-    operator value_and_holder &() { return *value; }
-    static constexpr auto name = _<value_and_holder>();
+  template <typename> using cast_op_type = value_and_holder &;
+  operator value_and_holder &() { return *value; }
+  static constexpr auto name = _<value_and_holder>();
 
 private:
-    value_and_holder *value = nullptr;
+  value_and_holder *value = nullptr;
 };
 
 NAMESPACE_BEGIN(initimpl)
 
 inline void no_nullptr(void *ptr) {
-    if (!ptr) throw type_error("pybind11::init(): factory function returned nullptr");
+  if (!ptr)
+    throw type_error("pybind11::init(): factory function returned nullptr");
 }
 
 // Implementing functions for all forms of py::init<...> and py::init(...)
@@ -41,293 +42,372 @@ template <typename Class> using Cpp = typename Class::type;
 template <typename Class> using Alias = typename Class::type_alias;
 template <typename Class> using Holder = typename Class::holder_type;
 
-template <typename Class> using is_alias_constructible = std::is_constructible<Alias<Class>, Cpp<Class> &&>;
+template <typename Class>
+using is_alias_constructible =
+    std::is_constructible<Alias<Class>, Cpp<Class> &&>;
 
-// Takes a Cpp pointer and returns true if it actually is a polymorphic Alias instance.
+// Takes a Cpp pointer and returns true if it actually is a polymorphic Alias
+// instance.
 template <typename Class, enable_if_t<Class::has_alias, int> = 0>
 bool is_alias(Cpp<Class> *ptr) {
-    return dynamic_cast<Alias<Class> *>(ptr) != nullptr;
+  return dynamic_cast<Alias<Class> *>(ptr) != nullptr;
 }
-// Failing fallback version of the above for a no-alias class (always returns false)
-template <typename /*Class*/>
-constexpr bool is_alias(void *) { return false; }
+// Failing fallback version of the above for a no-alias class (always returns
+// false)
+template <typename /*Class*/> constexpr bool is_alias(void *) { return false; }
 
-// Constructs and returns a new object; if the given arguments don't map to a constructor, we fall
-// back to brace aggregate initiailization so that for aggregate initialization can be used with
-// py::init, e.g.  `py::init<int, int>` to initialize a `struct T { int a; int b; }`.  For
-// non-aggregate types, we need to use an ordinary T(...) constructor (invoking as `T{...}` usually
-// works, but will not do the expected thing when `T` has an `initializer_list<T>` constructor).
-template <typename Class, typename... Args, detail::enable_if_t<std::is_constructible<Class, Args...>::value, int> = 0>
-inline Class *construct_or_initialize(Args &&...args) { return new Class(std::forward<Args>(args)...); }
-template <typename Class, typename... Args, detail::enable_if_t<!std::is_constructible<Class, Args...>::value, int> = 0>
-inline Class *construct_or_initialize(Args &&...args) { return new Class{std::forward<Args>(args)...}; }
+// Constructs and returns a new object; if the given arguments don't map to a
+// constructor, we fall back to brace aggregate initialization so that
+// aggregate initialization can be used with py::init, e.g.  `py::init<int,
+// int>` to initialize a `struct T { int a; int b; }`.  For non-aggregate types,
+// we need to use an ordinary T(...) constructor (invoking as `T{...}` usually
+// works, but will not do the expected thing when `T` has an
+// `initializer_list<T>` constructor).
+template <
+    typename Class, typename... Args,
+    detail::enable_if_t<std::is_constructible<Class, Args...>::value, int> = 0>
+inline Class *construct_or_initialize(Args &&... args) {
+  return new Class(std::forward<Args>(args)...);
+}
+template <
+    typename Class, typename... Args,
+    detail::enable_if_t<!std::is_constructible<Class, Args...>::value, int> = 0>
+inline Class *construct_or_initialize(Args &&... args) {
+  return new Class{std::forward<Args>(args)...};
+}
 
-// Attempts to constructs an alias using a `Alias(Cpp &&)` constructor.  This allows types with
-// an alias to provide only a single Cpp factory function as long as the Alias can be
-// constructed from an rvalue reference of the base Cpp type.  This means that Alias classes
-// can, when appropriate, simply define a `Alias(Cpp &&)` constructor rather than needing to
-// inherit all the base class constructors.
+// Attempts to construct an alias using an `Alias(Cpp &&)` constructor.  This
+// allows types with an alias to provide only a single Cpp factory function as
+// long as the Alias can be constructed from an rvalue reference of the base Cpp
+// type.  This means that Alias classes can, when appropriate, simply define a
+// `Alias(Cpp &&)` constructor rather than needing to inherit all the base class
+// constructors.
 template <typename Class>
 void construct_alias_from_cpp(std::true_type /*is_alias_constructible*/,
                               value_and_holder &v_h, Cpp<Class> &&base) {
-    v_h.value_ptr() = new Alias<Class>(std::move(base));
+  v_h.value_ptr() = new Alias<Class>(std::move(base));
 }
 template <typename Class>
-[[noreturn]] void construct_alias_from_cpp(std::false_type /*!is_alias_constructible*/,
-                                           value_and_holder &, Cpp<Class> &&) {
-    throw type_error("pybind11::init(): unable to convert returned instance to required "
-                     "alias class: no `Alias<Class>(Class &&)` constructor available");
+[[noreturn]] void
+construct_alias_from_cpp(std::false_type /*!is_alias_constructible*/,
+                         value_and_holder &, Cpp<Class> &&) {
+  throw type_error(
+      "pybind11::init(): unable to convert returned instance to required "
+      "alias class: no `Alias<Class>(Class &&)` constructor available");
 }
 
-// Error-generating fallback for factories that don't match one of the below construction
-// mechanisms.
-template <typename Class>
-void construct(...) {
-    static_assert(!std::is_same<Class, Class>::value /* always false */,
-            "pybind11::init(): init function must return a compatible pointer, "
-            "holder, or value");
+// Error-generating fallback for factories that don't match one of the below
+// construction mechanisms.
+template <typename Class> void construct(...) {
+  static_assert(
+      !std::is_same<Class, Class>::value /* always false */,
+      "pybind11::init(): init function must return a compatible pointer, "
+      "holder, or value");
 }
 
-// Pointer return v1: the factory function returns a class pointer for a registered class.
-// If we don't need an alias (because this class doesn't have one, or because the final type is
-// inherited on the Python side) we can simply take over ownership.  Otherwise we need to try to
-// construct an Alias from the returned base instance.
+// Pointer return v1: the factory function returns a class pointer for a
+// registered class. If we don't need an alias (because this class doesn't have
+// one, or because the final type is inherited on the Python side) we can simply
+// take over ownership.  Otherwise we need to try to construct an Alias from the
+// returned base instance.
 template <typename Class>
 void construct(value_and_holder &v_h, Cpp<Class> *ptr, bool need_alias) {
-    no_nullptr(ptr);
-    if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
-        // We're going to try to construct an alias by moving the cpp type.  Whether or not
-        // that succeeds, we still need to destroy the original cpp pointer (either the
-        // moved away leftover, if the alias construction works, or the value itself if we
-        // throw an error), but we can't just call `delete ptr`: it might have a special
-        // deleter, or might be shared_from_this.  So we construct a holder around it as if
-        // it was a normal instance, then steal the holder away into a local variable; thus
-        // the holder and destruction happens when we leave the C++ scope, and the holder
-        // class gets to handle the destruction however it likes.
-        v_h.value_ptr() = ptr;
-        v_h.set_instance_registered(true); // To prevent init_instance from registering it
-        v_h.type->init_instance(v_h.inst, nullptr); // Set up the holder
-        Holder<Class> temp_holder(std::move(v_h.holder<Holder<Class>>())); // Steal the holder
-        v_h.type->dealloc(v_h); // Destroys the moved-out holder remains, resets value ptr to null
-        v_h.set_instance_registered(false);
+  no_nullptr(ptr);
+  if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
+    // We're going to try to construct an alias by moving the cpp type.  Whether
+    // or not that succeeds, we still need to destroy the original cpp pointer
+    // (either the moved away leftover, if the alias construction works, or the
+    // value itself if we throw an error), but we can't just call `delete ptr`:
+    // it might have a special deleter, or might be shared_from_this.  So we
+    // construct a holder around it as if it was a normal instance, then steal
+    // the holder away into a local variable; thus the holder and destruction
+    // happens when we leave the C++ scope, and the holder class gets to handle
+    // the destruction however it likes.
+    v_h.value_ptr() = ptr;
+    v_h.set_instance_registered(
+        true); // To prevent init_instance from registering it
+    v_h.type->init_instance(v_h.inst, nullptr); // Set up the holder
+    Holder<Class> temp_holder(
+        std::move(v_h.holder<Holder<Class>>())); // Steal the holder
+    v_h.type->dealloc(
+        v_h); // Destroys the moved-out holder remains, resets value ptr to null
+    v_h.set_instance_registered(false);
 
-        construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(*ptr));
-    } else {
-        // Otherwise the type isn't inherited, so we don't need an Alias
-        v_h.value_ptr() = ptr;
-    }
+    construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h,
+                                    std::move(*ptr));
+  } else {
+    // Otherwise the type isn't inherited, so we don't need an Alias
+    v_h.value_ptr() = ptr;
+  }
 }
 
-// Pointer return v2: a factory that always returns an alias instance ptr.  We simply take over
-// ownership of the pointer.
+// Pointer return v2: a factory that always returns an alias instance ptr.  We
+// simply take over ownership of the pointer.
 template <typename Class, enable_if_t<Class::has_alias, int> = 0>
 void construct(value_and_holder &v_h, Alias<Class> *alias_ptr, bool) {
-    no_nullptr(alias_ptr);
-    v_h.value_ptr() = static_cast<Cpp<Class> *>(alias_ptr);
+  no_nullptr(alias_ptr);
+  v_h.value_ptr() = static_cast<Cpp<Class> *>(alias_ptr);
 }
 
-// Holder return: copy its pointer, and move or copy the returned holder into the new instance's
-// holder.  This also handles types like std::shared_ptr<T> and std::unique_ptr<T> where T is a
-// derived type (through those holder's implicit conversion from derived class holder constructors).
+// Holder return: copy its pointer, and move or copy the returned holder into
+// the new instance's holder.  This also handles types like std::shared_ptr<T>
+// and std::unique_ptr<T> where T is a derived type (through those holders'
+// implicit conversion from derived class holder constructors).
 template <typename Class>
 void construct(value_and_holder &v_h, Holder<Class> holder, bool need_alias) {
-    auto *ptr = holder_helper<Holder<Class>>::get(holder);
-    // If we need an alias, check that the held pointer is actually an alias instance
-    if (Class::has_alias && need_alias && !is_alias<Class>(ptr))
-        throw type_error("pybind11::init(): construction failed: returned holder-wrapped instance "
-                         "is not an alias instance");
+  auto *ptr = holder_helper<Holder<Class>>::get(holder);
+  // If we need an alias, check that the held pointer is actually an alias
+  // instance
+  if (Class::has_alias && need_alias && !is_alias<Class>(ptr))
+    throw type_error("pybind11::init(): construction failed: returned "
+                     "holder-wrapped instance "
+                     "is not an alias instance");
 
-    v_h.value_ptr() = ptr;
-    v_h.type->init_instance(v_h.inst, &holder);
+  v_h.value_ptr() = ptr;
+  v_h.type->init_instance(v_h.inst, &holder);
 }
 
-// return-by-value version 1: returning a cpp class by value.  If the class has an alias and an
-// alias is required the alias must have an `Alias(Cpp &&)` constructor so that we can construct
-// the alias from the base when needed (i.e. because of Python-side inheritance).  When we don't
-// need it, we simply move-construct the cpp value into a new instance.
+// return-by-value version 1: returning a cpp class by value.  If the class has
+// an alias and an alias is required the alias must have an `Alias(Cpp &&)`
+// constructor so that we can construct the alias from the base when needed
+// (i.e. because of Python-side inheritance).  When we don't need it, we simply
+// move-construct the cpp value into a new instance.
 template <typename Class>
 void construct(value_and_holder &v_h, Cpp<Class> &&result, bool need_alias) {
-    static_assert(std::is_move_constructible<Cpp<Class>>::value,
-        "pybind11::init() return-by-value factory function requires a movable class");
-    if (Class::has_alias && need_alias)
-        construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(result));
-    else
-        v_h.value_ptr() = new Cpp<Class>(std::move(result));
+  static_assert(std::is_move_constructible<Cpp<Class>>::value,
+                "pybind11::init() return-by-value factory function requires a "
+                "movable class");
+  if (Class::has_alias && need_alias)
+    construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h,
+                                    std::move(result));
+  else
+    v_h.value_ptr() = new Cpp<Class>(std::move(result));
 }
 
-// return-by-value version 2: returning a value of the alias type itself.  We move-construct an
-// Alias instance (even if no the python-side inheritance is involved).  The is intended for
-// cases where Alias initialization is always desired.
+// return-by-value version 2: returning a value of the alias type itself.  We
+// move-construct an Alias instance (even if no Python-side inheritance is
+// involved).  This is intended for cases where Alias initialization is always
+// desired.
 template <typename Class>
 void construct(value_and_holder &v_h, Alias<Class> &&result, bool) {
-    static_assert(std::is_move_constructible<Alias<Class>>::value,
-        "pybind11::init() return-by-alias-value factory function requires a movable alias class");
-    v_h.value_ptr() = new Alias<Class>(std::move(result));
+  static_assert(std::is_move_constructible<Alias<Class>>::value,
+                "pybind11::init() return-by-alias-value factory function "
+                "requires a movable alias class");
+  v_h.value_ptr() = new Alias<Class>(std::move(result));
 }
 
 // Implementing class for py::init<...>()
-template <typename... Args>
-struct constructor {
-    template <typename Class, typename... Extra, enable_if_t<!Class::has_alias, int> = 0>
-    static void execute(Class &cl, const Extra&... extra) {
-        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
-            v_h.value_ptr() = construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
-        }, is_new_style_constructor(), extra...);
-    }
+template <typename... Args> struct constructor {
+  template <typename Class, typename... Extra,
+            enable_if_t<!Class::has_alias, int> = 0>
+  static void execute(Class &cl, const Extra &... extra) {
+    cl.def(
+        "__init__",
+        [](value_and_holder &v_h, Args... args) {
+          v_h.value_ptr() =
+              construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
+        },
+        is_new_style_constructor(), extra...);
+  }
 
-    template <typename Class, typename... Extra,
-              enable_if_t<Class::has_alias &&
-                          std::is_constructible<Cpp<Class>, Args...>::value, int> = 0>
-    static void execute(Class &cl, const Extra&... extra) {
-        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
-            if (Py_TYPE(v_h.inst) == v_h.type->type)
-                v_h.value_ptr() = construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
-            else
-                v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
-        }, is_new_style_constructor(), extra...);
-    }
+  template <typename Class, typename... Extra,
+            enable_if_t<Class::has_alias &&
+                            std::is_constructible<Cpp<Class>, Args...>::value,
+                        int> = 0>
+  static void execute(Class &cl, const Extra &... extra) {
+    cl.def(
+        "__init__",
+        [](value_and_holder &v_h, Args... args) {
+          if (Py_TYPE(v_h.inst) == v_h.type->type)
+            v_h.value_ptr() = construct_or_initialize<Cpp<Class>>(
+                std::forward<Args>(args)...);
+          else
+            v_h.value_ptr() = construct_or_initialize<Alias<Class>>(
+                std::forward<Args>(args)...);
+        },
+        is_new_style_constructor(), extra...);
+  }
 
-    template <typename Class, typename... Extra,
-              enable_if_t<Class::has_alias &&
-                          !std::is_constructible<Cpp<Class>, Args...>::value, int> = 0>
-    static void execute(Class &cl, const Extra&... extra) {
-        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
-            v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
-        }, is_new_style_constructor(), extra...);
-    }
+  template <typename Class, typename... Extra,
+            enable_if_t<Class::has_alias &&
+                            !std::is_constructible<Cpp<Class>, Args...>::value,
+                        int> = 0>
+  static void execute(Class &cl, const Extra &... extra) {
+    cl.def(
+        "__init__",
+        [](value_and_holder &v_h, Args... args) {
+          v_h.value_ptr() = construct_or_initialize<Alias<Class>>(
+              std::forward<Args>(args)...);
+        },
+        is_new_style_constructor(), extra...);
+  }
 };
 
 // Implementing class for py::init_alias<...>()
 template <typename... Args> struct alias_constructor {
-    template <typename Class, typename... Extra,
-              enable_if_t<Class::has_alias && std::is_constructible<Alias<Class>, Args...>::value, int> = 0>
-    static void execute(Class &cl, const Extra&... extra) {
-        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
-            v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
-        }, is_new_style_constructor(), extra...);
-    }
+  template <typename Class, typename... Extra,
+            enable_if_t<Class::has_alias &&
+                            std::is_constructible<Alias<Class>, Args...>::value,
+                        int> = 0>
+  static void execute(Class &cl, const Extra &... extra) {
+    cl.def(
+        "__init__",
+        [](value_and_holder &v_h, Args... args) {
+          v_h.value_ptr() = construct_or_initialize<Alias<Class>>(
+              std::forward<Args>(args)...);
+        },
+        is_new_style_constructor(), extra...);
+  }
 };
 
 // Implementation class for py::init(Func) and py::init(Func, AliasFunc)
 template <typename CFunc, typename AFunc = void_type (*)(),
-          typename = function_signature_t<CFunc>, typename = function_signature_t<AFunc>>
+          typename = function_signature_t<CFunc>,
+          typename = function_signature_t<AFunc>>
 struct factory;
 
 // Specialization for py::init(Func)
 template <typename Func, typename Return, typename... Args>
 struct factory<Func, void_type (*)(), Return(Args...)> {
-    remove_reference_t<Func> class_factory;
+  remove_reference_t<Func> class_factory;
 
-    factory(Func &&f) : class_factory(std::forward<Func>(f)) { }
+  factory(Func &&f) : class_factory(std::forward<Func>(f)) {}
 
-    // The given class either has no alias or has no separate alias factory;
-    // this always constructs the class itself.  If the class is registered with an alias
-    // type and an alias instance is needed (i.e. because the final type is a Python class
-    // inheriting from the C++ type) the returned value needs to either already be an alias
-    // instance, or the alias needs to be constructible from a `Class &&` argument.
-    template <typename Class, typename... Extra>
-    void execute(Class &cl, const Extra &...extra) && {
-        #if defined(PYBIND11_CPP14)
-        cl.def("__init__", [func = std::move(class_factory)]
-        #else
-        auto &func = class_factory;
-        cl.def("__init__", [func]
-        #endif
+  // The given class either has no alias or has no separate alias factory;
+  // this always constructs the class itself.  If the class is registered with
+  // an alias type and an alias instance is needed (i.e. because the final type
+  // is a Python class inheriting from the C++ type) the returned value needs to
+  // either already be an alias instance, or the alias needs to be constructible
+  // from a `Class &&` argument.
+  template <typename Class, typename... Extra>
+  void execute(Class &cl, const Extra &... extra) && {
+#if defined(PYBIND11_CPP14)
+    cl.def(
+        "__init__",
+        [func = std::move(class_factory)]
+#else
+    auto &func = class_factory;
+    cl.def(
+        "__init__",
+        [func]
+#endif
         (value_and_holder &v_h, Args... args) {
-            construct<Class>(v_h, func(std::forward<Args>(args)...),
-                             Py_TYPE(v_h.inst) != v_h.type->type);
-        }, is_new_style_constructor(), extra...);
-    }
+          construct<Class>(v_h, func(std::forward<Args>(args)...),
+                           Py_TYPE(v_h.inst) != v_h.type->type);
+        },
+        is_new_style_constructor(), extra...);
+  }
 };
 
 // Specialization for py::init(Func, AliasFunc)
-template <typename CFunc, typename AFunc,
-          typename CReturn, typename... CArgs, typename AReturn, typename... AArgs>
+template <typename CFunc, typename AFunc, typename CReturn, typename... CArgs,
+          typename AReturn, typename... AArgs>
 struct factory<CFunc, AFunc, CReturn(CArgs...), AReturn(AArgs...)> {
-    static_assert(sizeof...(CArgs) == sizeof...(AArgs),
-                  "pybind11::init(class_factory, alias_factory): class and alias factories "
-                  "must have identical argument signatures");
-    static_assert(all_of<std::is_same<CArgs, AArgs>...>::value,
-                  "pybind11::init(class_factory, alias_factory): class and alias factories "
-                  "must have identical argument signatures");
+  static_assert(
+      sizeof...(CArgs) == sizeof...(AArgs),
+      "pybind11::init(class_factory, alias_factory): class and alias factories "
+      "must have identical argument signatures");
+  static_assert(
+      all_of<std::is_same<CArgs, AArgs>...>::value,
+      "pybind11::init(class_factory, alias_factory): class and alias factories "
+      "must have identical argument signatures");
 
-    remove_reference_t<CFunc> class_factory;
-    remove_reference_t<AFunc> alias_factory;
+  remove_reference_t<CFunc> class_factory;
+  remove_reference_t<AFunc> alias_factory;
 
-    factory(CFunc &&c, AFunc &&a)
-        : class_factory(std::forward<CFunc>(c)), alias_factory(std::forward<AFunc>(a)) { }
+  factory(CFunc &&c, AFunc &&a)
+      : class_factory(std::forward<CFunc>(c)),
+        alias_factory(std::forward<AFunc>(a)) {}
 
-    // The class factory is called when the `self` type passed to `__init__` is the direct
-    // class (i.e. not inherited), the alias factory when `self` is a Python-side subtype.
-    template <typename Class, typename... Extra>
-    void execute(Class &cl, const Extra&... extra) && {
-        static_assert(Class::has_alias, "The two-argument version of `py::init()` can "
-                                        "only be used if the class has an alias");
-        #if defined(PYBIND11_CPP14)
-        cl.def("__init__", [class_func = std::move(class_factory), alias_func = std::move(alias_factory)]
-        #else
-        auto &class_func = class_factory;
-        auto &alias_func = alias_factory;
-        cl.def("__init__", [class_func, alias_func]
-        #endif
+  // The class factory is called when the `self` type passed to `__init__` is
+  // the direct class (i.e. not inherited), the alias factory when `self` is a
+  // Python-side subtype.
+  template <typename Class, typename... Extra>
+  void execute(Class &cl, const Extra &... extra) && {
+    static_assert(Class::has_alias,
+                  "The two-argument version of `py::init()` can "
+                  "only be used if the class has an alias");
+#if defined(PYBIND11_CPP14)
+    cl.def(
+        "__init__",
+        [class_func = std::move(class_factory),
+         alias_func = std::move(alias_factory)]
+#else
+    auto &class_func = class_factory;
+    auto &alias_func = alias_factory;
+    cl.def(
+        "__init__",
+        [class_func, alias_func]
+#endif
         (value_and_holder &v_h, CArgs... args) {
-            if (Py_TYPE(v_h.inst) == v_h.type->type)
-                // If the instance type equals the registered type we don't have inheritance, so
-                // don't need the alias and can construct using the class function:
-                construct<Class>(v_h, class_func(std::forward<CArgs>(args)...), false);
-            else
-                construct<Class>(v_h, alias_func(std::forward<CArgs>(args)...), true);
-        }, is_new_style_constructor(), extra...);
-    }
+          if (Py_TYPE(v_h.inst) == v_h.type->type)
+            // If the instance type equals the registered type we don't have
+            // inheritance, so don't need the alias and can construct using the
+            // class function:
+            construct<Class>(v_h, class_func(std::forward<CArgs>(args)...),
+                             false);
+          else
+            construct<Class>(v_h, alias_func(std::forward<CArgs>(args)...),
+                             true);
+        },
+        is_new_style_constructor(), extra...);
+  }
 };
 
 /// Set just the C++ state. Same as `__init__`.
 template <typename Class, typename T>
 void setstate(value_and_holder &v_h, T &&result, bool need_alias) {
-    construct<Class>(v_h, std::forward<T>(result), need_alias);
+  construct<Class>(v_h, std::forward<T>(result), need_alias);
 }
 
 /// Set both the C++ and Python states
 template <typename Class, typename T, typename O,
           enable_if_t<std::is_convertible<O, handle>::value, int> = 0>
-void setstate(value_and_holder &v_h, std::pair<T, O> &&result, bool need_alias) {
-    construct<Class>(v_h, std::move(result.first), need_alias);
-    setattr((PyObject *) v_h.inst, "__dict__", result.second);
+void setstate(value_and_holder &v_h, std::pair<T, O> &&result,
+              bool need_alias) {
+  construct<Class>(v_h, std::move(result.first), need_alias);
+  setattr((PyObject *)v_h.inst, "__dict__", result.second);
 }
 
 /// Implementation for py::pickle(GetState, SetState)
-template <typename Get, typename Set,
-          typename = function_signature_t<Get>, typename = function_signature_t<Set>>
+template <typename Get, typename Set, typename = function_signature_t<Get>,
+          typename = function_signature_t<Set>>
 struct pickle_factory;
 
-template <typename Get, typename Set,
-          typename RetState, typename Self, typename NewInstance, typename ArgState>
+template <typename Get, typename Set, typename RetState, typename Self,
+          typename NewInstance, typename ArgState>
 struct pickle_factory<Get, Set, RetState(Self), NewInstance(ArgState)> {
-    static_assert(std::is_same<intrinsic_t<RetState>, intrinsic_t<ArgState>>::value,
-                  "The type returned by `__getstate__` must be the same "
-                  "as the argument accepted by `__setstate__`");
+  static_assert(
+      std::is_same<intrinsic_t<RetState>, intrinsic_t<ArgState>>::value,
+      "The type returned by `__getstate__` must be the same "
+      "as the argument accepted by `__setstate__`");
 
-    remove_reference_t<Get> get;
-    remove_reference_t<Set> set;
+  remove_reference_t<Get> get;
+  remove_reference_t<Set> set;
 
-    pickle_factory(Get get, Set set)
-        : get(std::forward<Get>(get)), set(std::forward<Set>(set)) { }
+  pickle_factory(Get get, Set set)
+      : get(std::forward<Get>(get)), set(std::forward<Set>(set)) {}
 
-    template <typename Class, typename... Extra>
-    void execute(Class &cl, const Extra &...extra) && {
-        cl.def("__getstate__", std::move(get));
+  template <typename Class, typename... Extra>
+  void execute(Class &cl, const Extra &... extra) && {
+    cl.def("__getstate__", std::move(get));
 
 #if defined(PYBIND11_CPP14)
-        cl.def("__setstate__", [func = std::move(set)]
+    cl.def(
+        "__setstate__",
+        [func = std::move(set)]
 #else
-        auto &func = set;
-        cl.def("__setstate__", [func]
+    auto &func = set;
+    cl.def(
+        "__setstate__",
+        [func]
 #endif
         (value_and_holder &v_h, ArgState state) {
-            setstate<Class>(v_h, func(std::forward<ArgState>(state)),
-                            Py_TYPE(v_h.inst) != v_h.type->type);
-        }, is_new_style_constructor(), extra...);
-    }
+          setstate<Class>(v_h, func(std::forward<ArgState>(state)),
+                          Py_TYPE(v_h.inst) != v_h.type->type);
+        },
+        is_new_style_constructor(), extra...);
+  }
 };
 
 NAMESPACE_END(initimpl)
diff --git a/python/src/pybind11/detail/internals.h b/python/src/pybind11/detail/internals.h
index 4f25759d3..25a068aec 100644
--- a/python/src/pybind11/detail/internals.h
+++ b/python/src/pybind11/detail/internals.h
@@ -18,276 +18,323 @@ inline PyTypeObject *make_static_property_type();
 inline PyTypeObject *make_default_metaclass();
 inline PyObject *make_object_base_type(PyTypeObject *metaclass);
 
-// The old Python Thread Local Storage (TLS) API is deprecated in Python 3.7 in favor of the new
-// Thread Specific Storage (TSS) API.
+// The old Python Thread Local Storage (TLS) API is deprecated in Python 3.7 in
+// favor of the new Thread Specific Storage (TSS) API.
 #if PY_VERSION_HEX >= 0x03070000
-#    define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr
-#    define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key))
-#    define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value))
-#    define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr)
+#define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr
+#define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key))
+#define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value))
+#define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr)
 #else
-    // Usually an int but a long on Cygwin64 with Python 3.x
-#    define PYBIND11_TLS_KEY_INIT(var) decltype(PyThread_create_key()) var = 0
-#    define PYBIND11_TLS_GET_VALUE(key) PyThread_get_key_value((key))
-#    if PY_MAJOR_VERSION < 3
-#        define PYBIND11_TLS_DELETE_VALUE(key)                               \
-             PyThread_delete_key_value(key)
-#        define PYBIND11_TLS_REPLACE_VALUE(key, value)                       \
-             do {                                                            \
-                 PyThread_delete_key_value((key));                           \
-                 PyThread_set_key_value((key), (value));                     \
-             } while (false)
-#    else
-#        define PYBIND11_TLS_DELETE_VALUE(key)                               \
-             PyThread_set_key_value((key), nullptr)
-#        define PYBIND11_TLS_REPLACE_VALUE(key, value)                       \
-             PyThread_set_key_value((key), (value))
-#    endif
+// Usually an int but a long on Cygwin64 with Python 3.x
+#define PYBIND11_TLS_KEY_INIT(var) decltype(PyThread_create_key()) var = 0
+#define PYBIND11_TLS_GET_VALUE(key) PyThread_get_key_value((key))
+#if PY_MAJOR_VERSION < 3
+#define PYBIND11_TLS_DELETE_VALUE(key) PyThread_delete_key_value(key)
+#define PYBIND11_TLS_REPLACE_VALUE(key, value)                                 \
+  do {                                                                         \
+    PyThread_delete_key_value((key));                                          \
+    PyThread_set_key_value((key), (value));                                    \
+  } while (false)
+#else
+#define PYBIND11_TLS_DELETE_VALUE(key) PyThread_set_key_value((key), nullptr)
+#define PYBIND11_TLS_REPLACE_VALUE(key, value)                                 \
+  PyThread_set_key_value((key), (value))
+#endif
 #endif
 
-// Python loads modules by default with dlopen with the RTLD_LOCAL flag; under libc++ and possibly
-// other STLs, this means `typeid(A)` from one module won't equal `typeid(A)` from another module
-// even when `A` is the same, non-hidden-visibility type (e.g. from a common include).  Under
-// libstdc++, this doesn't happen: equality and the type_index hash are based on the type name,
-// which works.  If not under a known-good stl, provide our own name-based hash and equality
-// functions that use the type name.
+// Python loads modules by default with dlopen with the RTLD_LOCAL flag; under
+// libc++ and possibly other STLs, this means `typeid(A)` from one module won't
+// equal `typeid(A)` from another module even when `A` is the same,
+// non-hidden-visibility type (e.g. from a common include).  Under libstdc++,
+// this doesn't happen: equality and the type_index hash are based on the type
+// name, which works.  If not under a known-good stl, provide our own name-based
+// hash and equality functions that use the type name.
 #if defined(__GLIBCXX__)
-inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) { return lhs == rhs; }
+inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) {
+  return lhs == rhs;
+}
 using type_hash = std::hash<std::type_index>;
 using type_equal_to = std::equal_to<std::type_index>;
 #else
 inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) {
-    return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
+  return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
 }
 
 struct type_hash {
-    size_t operator()(const std::type_index &t) const {
-        size_t hash = 5381;
-        const char *ptr = t.name();
-        while (auto c = static_cast<unsigned char>(*ptr++))
-            hash = (hash * 33) ^ c;
-        return hash;
-    }
+  size_t operator()(const std::type_index &t) const {
+    size_t hash = 5381;
+    const char *ptr = t.name();
+    while (auto c = static_cast<unsigned char>(*ptr++))
+      hash = (hash * 33) ^ c;
+    return hash;
+  }
 };
 
 struct type_equal_to {
-    bool operator()(const std::type_index &lhs, const std::type_index &rhs) const {
-        return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
-    }
+  bool operator()(const std::type_index &lhs,
+                  const std::type_index &rhs) const {
+    return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
+  }
 };
 #endif
 
 template <typename value_type>
-using type_map = std::unordered_map<std::type_index, value_type, type_hash, type_equal_to>;
+using type_map =
+    std::unordered_map<std::type_index, value_type, type_hash, type_equal_to>;
 
 struct overload_hash {
-    inline size_t operator()(const std::pair<const PyObject *, const char *>& v) const {
-        size_t value = std::hash<const void *>()(v.first);
-        value ^= std::hash<const void *>()(v.second)  + 0x9e3779b9 + (value<<6) + (value>>2);
-        return value;
-    }
+  inline size_t
+  operator()(const std::pair<const PyObject *, const char *> &v) const {
+    size_t value = std::hash<const void *>()(v.first);
+    value ^= std::hash<const void *>()(v.second) + 0x9e3779b9 + (value << 6) +
+             (value >> 2);
+    return value;
+  }
 };
 
 /// Internal data structure used to track registered instances and types.
 /// Whenever binary incompatible changes are made to this structure,
 /// `PYBIND11_INTERNALS_VERSION` must be incremented.
 struct internals {
-    type_map<type_info *> registered_types_cpp; // std::type_index -> pybind11's type information
-    std::unordered_map<PyTypeObject *, std::vector<type_info *>> registered_types_py; // PyTypeObject* -> base type_info(s)
-    std::unordered_multimap<const void *, instance*> registered_instances; // void * -> instance*
-    std::unordered_set<std::pair<const PyObject *, const char *>, overload_hash> inactive_overload_cache;
-    type_map<std::vector<bool (*)(PyObject *, void *&)>> direct_conversions;
-    std::unordered_map<const PyObject *, std::vector<PyObject *>> patients;
-    std::forward_list<void (*) (std::exception_ptr)> registered_exception_translators;
-    std::unordered_map<std::string, void *> shared_data; // Custom data to be shared across extensions
-    std::vector<PyObject *> loader_patient_stack; // Used by `loader_life_support`
-    std::forward_list<std::string> static_strings; // Stores the std::strings backing detail::c_str()
-    PyTypeObject *static_property_type;
-    PyTypeObject *default_metaclass;
-    PyObject *instance_base;
+  type_map<type_info *>
+      registered_types_cpp; // std::type_index -> pybind11's type information
+  std::unordered_map<PyTypeObject *, std::vector<type_info *>>
+      registered_types_py; // PyTypeObject* -> base type_info(s)
+  std::unordered_multimap<const void *, instance *>
+      registered_instances; // void * -> instance*
+  std::unordered_set<std::pair<const PyObject *, const char *>, overload_hash>
+      inactive_overload_cache;
+  type_map<std::vector<bool (*)(PyObject *, void *&)>> direct_conversions;
+  std::unordered_map<const PyObject *, std::vector<PyObject *>> patients;
+  std::forward_list<void (*)(std::exception_ptr)>
+      registered_exception_translators;
+  std::unordered_map<std::string, void *>
+      shared_data; // Custom data to be shared across extensions
+  std::vector<PyObject *> loader_patient_stack; // Used by `loader_life_support`
+  std::forward_list<std::string>
+      static_strings; // Stores the std::strings backing detail::c_str()
+  PyTypeObject *static_property_type;
+  PyTypeObject *default_metaclass;
+  PyObject *instance_base;
 #if defined(WITH_THREAD)
-    PYBIND11_TLS_KEY_INIT(tstate);
-    PyInterpreterState *istate = nullptr;
+  PYBIND11_TLS_KEY_INIT(tstate);
+  PyInterpreterState *istate = nullptr;
 #endif
 };
 
 /// Additional type information which does not fit into the PyTypeObject.
 /// Changes to this struct also require bumping `PYBIND11_INTERNALS_VERSION`.
 struct type_info {
-    PyTypeObject *type;
-    const std::type_info *cpptype;
-    size_t type_size, type_align, holder_size_in_ptrs;
-    void *(*operator_new)(size_t);
-    void (*init_instance)(instance *, const void *);
-    void (*dealloc)(value_and_holder &v_h);
-    std::vector<PyObject *(*)(PyObject *, PyTypeObject *)> implicit_conversions;
-    std::vector<std::pair<const std::type_info *, void *(*)(void *)>> implicit_casts;
-    std::vector<bool (*)(PyObject *, void *&)> *direct_conversions;
-    buffer_info *(*get_buffer)(PyObject *, void *) = nullptr;
-    void *get_buffer_data = nullptr;
-    void *(*module_local_load)(PyObject *, const type_info *) = nullptr;
-    /* A simple type never occurs as a (direct or indirect) parent
-     * of a class that makes use of multiple inheritance */
-    bool simple_type : 1;
-    /* True if there is no multiple inheritance in this type's inheritance tree */
-    bool simple_ancestors : 1;
-    /* for base vs derived holder_type checks */
-    bool default_holder : 1;
-    /* true if this is a type registered with py::module_local */
-    bool module_local : 1;
+  PyTypeObject *type;
+  const std::type_info *cpptype;
+  size_t type_size, type_align, holder_size_in_ptrs;
+  void *(*operator_new)(size_t);
+  void (*init_instance)(instance *, const void *);
+  void (*dealloc)(value_and_holder &v_h);
+  std::vector<PyObject *(*)(PyObject *, PyTypeObject *)> implicit_conversions;
+  std::vector<std::pair<const std::type_info *, void *(*)(void *)>>
+      implicit_casts;
+  std::vector<bool (*)(PyObject *, void *&)> *direct_conversions;
+  buffer_info *(*get_buffer)(PyObject *, void *) = nullptr;
+  void *get_buffer_data = nullptr;
+  void *(*module_local_load)(PyObject *, const type_info *) = nullptr;
+  /* A simple type never occurs as a (direct or indirect) parent
+   * of a class that makes use of multiple inheritance */
+  bool simple_type : 1;
+  /* True if there is no multiple inheritance in this type's inheritance tree */
+  bool simple_ancestors : 1;
+  /* for base vs derived holder_type checks */
+  bool default_holder : 1;
+  /* true if this is a type registered with py::module_local */
+  bool module_local : 1;
 };
 
-/// Tracks the `internals` and `type_info` ABI version independent of the main library version
+/// Tracks the `internals` and `type_info` ABI version independent of the main
+/// library version
 #define PYBIND11_INTERNALS_VERSION 3
 
 #if defined(_DEBUG)
-#   define PYBIND11_BUILD_TYPE "_debug"
+#define PYBIND11_BUILD_TYPE "_debug"
 #else
-#   define PYBIND11_BUILD_TYPE ""
+#define PYBIND11_BUILD_TYPE ""
 #endif
 
 #if defined(WITH_THREAD)
-#  define PYBIND11_INTERNALS_KIND ""
+#define PYBIND11_INTERNALS_KIND ""
 #else
-#  define PYBIND11_INTERNALS_KIND "_without_thread"
+#define PYBIND11_INTERNALS_KIND "_without_thread"
 #endif
 
-#define PYBIND11_INTERNALS_ID "__pybind11_internals_v" \
-    PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_BUILD_TYPE "__"
+#define PYBIND11_INTERNALS_ID                                                  \
+  "__pybind11_internals_v" PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION)       \
+      PYBIND11_INTERNALS_KIND PYBIND11_BUILD_TYPE "__"
 
-#define PYBIND11_MODULE_LOCAL_ID "__pybind11_module_local_v" \
-    PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_BUILD_TYPE "__"
+#define PYBIND11_MODULE_LOCAL_ID                                               \
+  "__pybind11_module_local_v" PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION)    \
+      PYBIND11_INTERNALS_KIND PYBIND11_BUILD_TYPE "__"
 
 /// Each module locally stores a pointer to the `internals` data. The data
 /// itself is shared among modules with the same `PYBIND11_INTERNALS_ID`.
 inline internals **&get_internals_pp() {
-    static internals **internals_pp = nullptr;
-    return internals_pp;
+  static internals **internals_pp = nullptr;
+  return internals_pp;
 }
 
 /// Return a reference to the current `internals` data
 PYBIND11_NOINLINE inline internals &get_internals() {
-    auto **&internals_pp = get_internals_pp();
-    if (internals_pp && *internals_pp)
-        return **internals_pp;
-
-    constexpr auto *id = PYBIND11_INTERNALS_ID;
-    auto builtins = handle(PyEval_GetBuiltins());
-    if (builtins.contains(id) && isinstance<capsule>(builtins[id])) {
-        internals_pp = static_cast<internals **>(capsule(builtins[id]));
-
-        // We loaded builtins through python's builtins, which means that our `error_already_set`
-        // and `builtin_exception` may be different local classes than the ones set up in the
-        // initial exception translator, below, so add another for our local exception classes.
-        //
-        // libstdc++ doesn't require this (types there are identified only by name)
-#if !defined(__GLIBCXX__)
-        (*internals_pp)->registered_exception_translators.push_front(
-            [](std::exception_ptr p) -> void {
-                try {
-                    if (p) std::rethrow_exception(p);
-                } catch (error_already_set &e)       { e.restore();   return;
-                } catch (const builtin_exception &e) { e.set_error(); return;
-                }
-            }
-        );
-#endif
-    } else {
-        if (!internals_pp) internals_pp = new internals*();
-        auto *&internals_ptr = *internals_pp;
-        internals_ptr = new internals();
-#if defined(WITH_THREAD)
-        #if PY_VERSION_HEX < 0x03090000
-        PyEval_InitThreads();
-        #endif
-        PyThreadState *tstate = PyThreadState_Get();
-        #if PY_VERSION_HEX >= 0x03070000
-            internals_ptr->tstate = PyThread_tss_alloc();
-            if (!internals_ptr->tstate || PyThread_tss_create(internals_ptr->tstate))
-                pybind11_fail("get_internals: could not successfully initialize the TSS key!");
-            PyThread_tss_set(internals_ptr->tstate, tstate);
-        #else
-            internals_ptr->tstate = PyThread_create_key();
-            if (internals_ptr->tstate == -1)
-                pybind11_fail("get_internals: could not successfully initialize the TLS key!");
-            PyThread_set_key_value(internals_ptr->tstate, tstate);
-        #endif
-        internals_ptr->istate = tstate->interp;
-#endif
-        builtins[id] = capsule(internals_pp);
-        internals_ptr->registered_exception_translators.push_front(
-            [](std::exception_ptr p) -> void {
-                try {
-                    if (p) std::rethrow_exception(p);
-                } catch (error_already_set &e)           { e.restore();                                    return;
-                } catch (const builtin_exception &e)     { e.set_error();                                  return;
-                } catch (const std::bad_alloc &e)        { PyErr_SetString(PyExc_MemoryError,   e.what()); return;
-                } catch (const std::domain_error &e)     { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-                } catch (const std::invalid_argument &e) { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-                } catch (const std::length_error &e)     { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-                } catch (const std::out_of_range &e)     { PyErr_SetString(PyExc_IndexError,    e.what()); return;
-                } catch (const std::range_error &e)      { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-                } catch (const std::exception &e)        { PyErr_SetString(PyExc_RuntimeError,  e.what()); return;
-                } catch (...) {
-                    PyErr_SetString(PyExc_RuntimeError, "Caught an unknown exception!");
-                    return;
-                }
-            }
-        );
-        internals_ptr->static_property_type = make_static_property_type();
-        internals_ptr->default_metaclass = make_default_metaclass();
-        internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass);
-    }
+  auto **&internals_pp = get_internals_pp();
+  if (internals_pp && *internals_pp)
     return **internals_pp;
+
+  constexpr auto *id = PYBIND11_INTERNALS_ID;
+  auto builtins = handle(PyEval_GetBuiltins());
+  if (builtins.contains(id) && isinstance<capsule>(builtins[id])) {
+    internals_pp = static_cast<internals **>(capsule(builtins[id]));
+
+    // We loaded the internals through python's builtins, which means that our
+    // `error_already_set` and `builtin_exception` may be different local
+    // classes than the ones set up in the initial exception translator, below,
+    // so add another for our local exception classes.
+    //
+    // libstdc++ doesn't require this (types there are identified only by name)
+#if !defined(__GLIBCXX__)
+    (*internals_pp)
+        ->registered_exception_translators.push_front(
+            [](std::exception_ptr p) -> void {
+              try {
+                if (p)
+                  std::rethrow_exception(p);
+              } catch (error_already_set &e) {
+                e.restore();
+                return;
+              } catch (const builtin_exception &e) {
+                e.set_error();
+                return;
+              }
+            });
+#endif
+  } else {
+    if (!internals_pp)
+      internals_pp = new internals *();
+    auto *&internals_ptr = *internals_pp;
+    internals_ptr = new internals();
+#if defined(WITH_THREAD)
+#if PY_VERSION_HEX < 0x03090000
+    PyEval_InitThreads();
+#endif
+    PyThreadState *tstate = PyThreadState_Get();
+#if PY_VERSION_HEX >= 0x03070000
+    internals_ptr->tstate = PyThread_tss_alloc();
+    if (!internals_ptr->tstate || PyThread_tss_create(internals_ptr->tstate))
+      pybind11_fail(
+          "get_internals: could not successfully initialize the TSS key!");
+    PyThread_tss_set(internals_ptr->tstate, tstate);
+#else
+    internals_ptr->tstate = PyThread_create_key();
+    if (internals_ptr->tstate == -1)
+      pybind11_fail(
+          "get_internals: could not successfully initialize the TLS key!");
+    PyThread_set_key_value(internals_ptr->tstate, tstate);
+#endif
+    internals_ptr->istate = tstate->interp;
+#endif
+    builtins[id] = capsule(internals_pp);
+    internals_ptr->registered_exception_translators.push_front(
+        [](std::exception_ptr p) -> void {
+          try {
+            if (p)
+              std::rethrow_exception(p);
+          } catch (error_already_set &e) {
+            e.restore();
+            return;
+          } catch (const builtin_exception &e) {
+            e.set_error();
+            return;
+          } catch (const std::bad_alloc &e) {
+            PyErr_SetString(PyExc_MemoryError, e.what());
+            return;
+          } catch (const std::domain_error &e) {
+            PyErr_SetString(PyExc_ValueError, e.what());
+            return;
+          } catch (const std::invalid_argument &e) {
+            PyErr_SetString(PyExc_ValueError, e.what());
+            return;
+          } catch (const std::length_error &e) {
+            PyErr_SetString(PyExc_ValueError, e.what());
+            return;
+          } catch (const std::out_of_range &e) {
+            PyErr_SetString(PyExc_IndexError, e.what());
+            return;
+          } catch (const std::range_error &e) {
+            PyErr_SetString(PyExc_ValueError, e.what());
+            return;
+          } catch (const std::exception &e) {
+            PyErr_SetString(PyExc_RuntimeError, e.what());
+            return;
+          } catch (...) {
+            PyErr_SetString(PyExc_RuntimeError, "Caught an unknown exception!");
+            return;
+          }
+        });
+    internals_ptr->static_property_type = make_static_property_type();
+    internals_ptr->default_metaclass = make_default_metaclass();
+    internals_ptr->instance_base =
+        make_object_base_type(internals_ptr->default_metaclass);
+  }
+  return **internals_pp;
 }
 
-/// Works like `internals.registered_types_cpp`, but for module-local registered types:
+/// Works like `internals.registered_types_cpp`, but for module-local registered
+/// types:
 inline type_map<type_info *> &registered_local_types_cpp() {
-    static type_map<type_info *> locals{};
-    return locals;
+  static type_map<type_info *> locals{};
+  return locals;
 }
 
-/// Constructs a std::string with the given arguments, stores it in `internals`, and returns its
-/// `c_str()`.  Such strings objects have a long storage duration -- the internal strings are only
-/// cleared when the program exits or after interpreter shutdown (when embedding), and so are
-/// suitable for c-style strings needed by Python internals (such as PyTypeObject's tp_name).
-template <typename... Args>
-const char *c_str(Args &&...args) {
-    auto &strings = get_internals().static_strings;
-    strings.emplace_front(std::forward<Args>(args)...);
-    return strings.front().c_str();
+/// Constructs a std::string with the given arguments, stores it in `internals`,
+/// and returns its `c_str()`.  Such string objects have a long storage
+/// duration -- the internal strings are only cleared when the program exits or
+/// after interpreter shutdown (when embedding), and so are suitable for c-style
+/// strings needed by Python internals (such as PyTypeObject's tp_name).
+template <typename... Args> const char *c_str(Args &&... args) {
+  auto &strings = get_internals().static_strings;
+  strings.emplace_front(std::forward<Args>(args)...);
+  return strings.front().c_str();
 }
 
 NAMESPACE_END(detail)
 
-/// Returns a named pointer that is shared among all extension modules (using the same
-/// pybind11 version) running in the current interpreter. Names starting with underscores
-/// are reserved for internal usage. Returns `nullptr` if no matching entry was found.
+/// Returns a named pointer that is shared among all extension modules (using
+/// the same pybind11 version) running in the current interpreter. Names
+/// starting with underscores are reserved for internal usage. Returns `nullptr`
+/// if no matching entry was found.
 inline PYBIND11_NOINLINE void *get_shared_data(const std::string &name) {
-    auto &internals = detail::get_internals();
-    auto it = internals.shared_data.find(name);
-    return it != internals.shared_data.end() ? it->second : nullptr;
+  auto &internals = detail::get_internals();
+  auto it = internals.shared_data.find(name);
+  return it != internals.shared_data.end() ? it->second : nullptr;
 }
 
 /// Set the shared data that can be later recovered by `get_shared_data()`.
-inline PYBIND11_NOINLINE void *set_shared_data(const std::string &name, void *data) {
-    detail::get_internals().shared_data[name] = data;
-    return data;
+inline PYBIND11_NOINLINE void *set_shared_data(const std::string &name,
+                                               void *data) {
+  detail::get_internals().shared_data[name] = data;
+  return data;
 }
 
-/// Returns a typed reference to a shared data entry (by using `get_shared_data()`) if
-/// such entry exists. Otherwise, a new object of default-constructible type `T` is
-/// added to the shared data under the given name and a reference to it is returned.
-template<typename T>
-T &get_or_create_shared_data(const std::string &name) {
-    auto &internals = detail::get_internals();
-    auto it = internals.shared_data.find(name);
-    T *ptr = (T *) (it != internals.shared_data.end() ? it->second : nullptr);
-    if (!ptr) {
-        ptr = new T();
-        internals.shared_data[name] = ptr;
-    }
-    return *ptr;
+/// Returns a typed reference to a shared data entry (by using
+/// `get_shared_data()`) if such an entry exists. Otherwise, a new object of
+/// default-constructible type `T` is added to the shared data under the given
+/// name and a reference to it is returned.
+template <typename T> T &get_or_create_shared_data(const std::string &name) {
+  auto &internals = detail::get_internals();
+  auto it = internals.shared_data.find(name);
+  T *ptr = (T *)(it != internals.shared_data.end() ? it->second : nullptr);
+  if (!ptr) {
+    ptr = new T();
+    internals.shared_data[name] = ptr;
+  }
+  return *ptr;
 }
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/detail/typeid.h b/python/src/pybind11/detail/typeid.h
index 9c8a4fc69..9c0589b53 100644
--- a/python/src/pybind11/detail/typeid.h
+++ b/python/src/pybind11/detail/typeid.h
@@ -22,34 +22,35 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 /// Erase all occurrences of a substring
 inline void erase_all(std::string &string, const std::string &search) {
-    for (size_t pos = 0;;) {
-        pos = string.find(search, pos);
-        if (pos == std::string::npos) break;
-        string.erase(pos, search.length());
-    }
+  for (size_t pos = 0;;) {
+    pos = string.find(search, pos);
+    if (pos == std::string::npos)
+      break;
+    string.erase(pos, search.length());
+  }
 }
 
 PYBIND11_NOINLINE inline void clean_type_id(std::string &name) {
 #if defined(__GNUG__)
-    int status = 0;
-    std::unique_ptr<char, void (*)(void *)> res {
-        abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), std::free };
-    if (status == 0)
-        name = res.get();
+  int status = 0;
+  std::unique_ptr<char, void (*)(void *)> res{
+      abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), std::free};
+  if (status == 0)
+    name = res.get();
 #else
-    detail::erase_all(name, "class ");
-    detail::erase_all(name, "struct ");
-    detail::erase_all(name, "enum ");
+  detail::erase_all(name, "class ");
+  detail::erase_all(name, "struct ");
+  detail::erase_all(name, "enum ");
 #endif
-    detail::erase_all(name, "pybind11::");
+  detail::erase_all(name, "pybind11::");
 }
 NAMESPACE_END(detail)
 
 /// Return a string representation of a C++ type
 template <typename T> static std::string type_id() {
-    std::string name(typeid(T).name());
-    detail::clean_type_id(name);
-    return name;
+  std::string name(typeid(T).name());
+  detail::clean_type_id(name);
+  return name;
 }
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/eigen.h b/python/src/pybind11/eigen.h
index d963d9650..4416f91b1 100644
--- a/python/src/pybind11/eigen.h
+++ b/python/src/pybind11/eigen.h
@@ -12,596 +12,712 @@
 #include "numpy.h"
 
 #if defined(__INTEL_COMPILER)
-#  pragma warning(disable: 1682) // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem)
+#pragma warning(                                                               \
+    disable : 1682) // implicit conversion of a 64-bit integral type to a
+                    // smaller integral type (potential portability problem)
 #elif defined(__GNUG__) || defined(__clang__)
-#  pragma GCC diagnostic push
-#  pragma GCC diagnostic ignored "-Wconversion"
-#  pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#  ifdef __clang__
-//   Eigen generates a bunch of implicit-copy-constructor-is-deprecated warnings with -Wdeprecated
-//   under Clang, so disable that warning here:
-#    pragma GCC diagnostic ignored "-Wdeprecated"
-#  endif
-#  if __GNUC__ >= 7
-#    pragma GCC diagnostic ignored "-Wint-in-bool-context"
-#  endif
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#ifdef __clang__
+//   Eigen generates a bunch of implicit-copy-constructor-is-deprecated warnings
+//   with -Wdeprecated under Clang, so disable that warning here:
+#pragma GCC diagnostic ignored "-Wdeprecated"
+#endif
+#if __GNUC__ >= 7
+#pragma GCC diagnostic ignored "-Wint-in-bool-context"
+#endif
 #endif
 
 #if defined(_MSC_VER)
-#  pragma warning(push)
-#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
-#  pragma warning(disable: 4996) // warning C4996: std::unary_negate is deprecated in C++17
+#pragma warning(push)
+#pragma warning(                                                               \
+    disable : 4127) // warning C4127: Conditional expression is constant
+#pragma warning(                                                               \
+    disable : 4996) // warning C4996: std::unary_negate is deprecated in C++17
 #endif
 
 #include <Eigen/Core>
 #include <Eigen/SparseCore>
 
-// Eigen prior to 3.2.7 doesn't have proper move constructors--but worse, some classes get implicit
-// move constructors that break things.  We could detect this an explicitly copy, but an extra copy
-// of matrices seems highly undesirable.
-static_assert(EIGEN_VERSION_AT_LEAST(3,2,7), "Eigen support in pybind11 requires Eigen >= 3.2.7");
+// Eigen prior to 3.2.7 doesn't have proper move constructors--but worse, some
+// classes get implicit move constructors that break things.  We could detect
+// this and explicitly copy, but an extra copy of matrices seems highly
+// undesirable.
+static_assert(EIGEN_VERSION_AT_LEAST(3, 2, 7),
+              "Eigen support in pybind11 requires Eigen >= 3.2.7");
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
-// Provide a convenience alias for easier pass-by-ref usage with fully dynamic strides:
+// Provide a convenience alias for easier pass-by-ref usage with fully dynamic
+// strides:
 using EigenDStride = Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>;
-template <typename MatrixType> using EigenDRef = Eigen::Ref<MatrixType, 0, EigenDStride>;
-template <typename MatrixType> using EigenDMap = Eigen::Map<MatrixType, 0, EigenDStride>;
+template <typename MatrixType>
+using EigenDRef = Eigen::Ref<MatrixType, 0, EigenDStride>;
+template <typename MatrixType>
+using EigenDMap = Eigen::Map<MatrixType, 0, EigenDStride>;
 
 NAMESPACE_BEGIN(detail)
 
-#if EIGEN_VERSION_AT_LEAST(3,3,0)
+#if EIGEN_VERSION_AT_LEAST(3, 3, 0)
 using EigenIndex = Eigen::Index;
 #else
 using EigenIndex = EIGEN_DEFAULT_DENSE_INDEX_TYPE;
 #endif
 
 // Matches Eigen::Map, Eigen::Ref, blocks, etc:
-template <typename T> using is_eigen_dense_map = all_of<is_template_base_of<Eigen::DenseBase, T>, std::is_base_of<Eigen::MapBase<T, Eigen::ReadOnlyAccessors>, T>>;
-template <typename T> using is_eigen_mutable_map = std::is_base_of<Eigen::MapBase<T, Eigen::WriteAccessors>, T>;
-template <typename T> using is_eigen_dense_plain = all_of<negation<is_eigen_dense_map<T>>, is_template_base_of<Eigen::PlainObjectBase, T>>;
-template <typename T> using is_eigen_sparse = is_template_base_of<Eigen::SparseMatrixBase, T>;
-// Test for objects inheriting from EigenBase<Derived> that aren't captured by the above.  This
-// basically covers anything that can be assigned to a dense matrix but that don't have a typical
-// matrix data layout that can be copied from their .data().  For example, DiagonalMatrix and
-// SelfAdjointView fall into this category.
-template <typename T> using is_eigen_other = all_of<
-    is_template_base_of<Eigen::EigenBase, T>,
-    negation<any_of<is_eigen_dense_map<T>, is_eigen_dense_plain<T>, is_eigen_sparse<T>>>
->;
+template <typename T>
+using is_eigen_dense_map =
+    all_of<is_template_base_of<Eigen::DenseBase, T>,
+           std::is_base_of<Eigen::MapBase<T, Eigen::ReadOnlyAccessors>, T>>;
+template <typename T>
+using is_eigen_mutable_map =
+    std::is_base_of<Eigen::MapBase<T, Eigen::WriteAccessors>, T>;
+template <typename T>
+using is_eigen_dense_plain =
+    all_of<negation<is_eigen_dense_map<T>>,
+           is_template_base_of<Eigen::PlainObjectBase, T>>;
+template <typename T>
+using is_eigen_sparse = is_template_base_of<Eigen::SparseMatrixBase, T>;
+// Test for objects inheriting from EigenBase<Derived> that aren't captured by
+// the above.  This basically covers anything that can be assigned to a dense
+// matrix but that doesn't have a typical matrix data layout that can be copied
+// from their .data().  For example, DiagonalMatrix and SelfAdjointView fall
+// into this category.
+template <typename T>
+using is_eigen_other =
+    all_of<is_template_base_of<Eigen::EigenBase, T>,
+           negation<any_of<is_eigen_dense_map<T>, is_eigen_dense_plain<T>,
+                           is_eigen_sparse<T>>>>;
 
-// Captures numpy/eigen conformability status (returned by EigenProps::conformable()):
+// Captures numpy/eigen conformability status (returned by
+// EigenProps::conformable()):
 template <bool EigenRowMajor> struct EigenConformable {
-    bool conformable = false;
-    EigenIndex rows = 0, cols = 0;
-    EigenDStride stride{0, 0};      // Only valid if negativestrides is false!
-    bool negativestrides = false;   // If true, do not use stride!
+  bool conformable = false;
+  EigenIndex rows = 0, cols = 0;
+  EigenDStride stride{0, 0};    // Only valid if negativestrides is false!
+  bool negativestrides = false; // If true, do not use stride!
 
-    EigenConformable(bool fits = false) : conformable{fits} {}
-    // Matrix type:
-    EigenConformable(EigenIndex r, EigenIndex c,
-            EigenIndex rstride, EigenIndex cstride) :
-        conformable{true}, rows{r}, cols{c} {
-        // TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity. http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747
-        if (rstride < 0 || cstride < 0) {
-            negativestrides = true;
-        } else {
-            stride = {EigenRowMajor ? rstride : cstride /* outer stride */,
-                      EigenRowMajor ? cstride : rstride /* inner stride */ };
-        }
+  EigenConformable(bool fits = false) : conformable{fits} {}
+  // Matrix type:
+  EigenConformable(EigenIndex r, EigenIndex c, EigenIndex rstride,
+                   EigenIndex cstride)
+      : conformable{true}, rows{r}, cols{c} {
+    // TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity.
+    // http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747
+    if (rstride < 0 || cstride < 0) {
+      negativestrides = true;
+    } else {
+      stride = {EigenRowMajor ? rstride : cstride /* outer stride */,
+                EigenRowMajor ? cstride : rstride /* inner stride */};
     }
-    // Vector type:
-    EigenConformable(EigenIndex r, EigenIndex c, EigenIndex stride)
-        : EigenConformable(r, c, r == 1 ? c*stride : stride, c == 1 ? r : r*stride) {}
+  }
+  // Vector type:
+  EigenConformable(EigenIndex r, EigenIndex c, EigenIndex stride)
+      : EigenConformable(r, c, r == 1 ? c * stride : stride,
+                         c == 1 ? r : r * stride) {}
 
-    template <typename props> bool stride_compatible() const {
-        // To have compatible strides, we need (on both dimensions) one of fully dynamic strides,
-        // matching strides, or a dimension size of 1 (in which case the stride value is irrelevant)
-        return
-            !negativestrides &&
-            (props::inner_stride == Eigen::Dynamic || props::inner_stride == stride.inner() ||
-                (EigenRowMajor ? cols : rows) == 1) &&
-            (props::outer_stride == Eigen::Dynamic || props::outer_stride == stride.outer() ||
-                (EigenRowMajor ? rows : cols) == 1);
-    }
-    operator bool() const { return conformable; }
+  template <typename props> bool stride_compatible() const {
+    // To have compatible strides, we need (on both dimensions) one of fully
+    // dynamic strides, matching strides, or a dimension size of 1 (in which
+    // case the stride value is irrelevant)
+    return !negativestrides &&
+           (props::inner_stride == Eigen::Dynamic ||
+            props::inner_stride == stride.inner() ||
+            (EigenRowMajor ? cols : rows) == 1) &&
+           (props::outer_stride == Eigen::Dynamic ||
+            props::outer_stride == stride.outer() ||
+            (EigenRowMajor ? rows : cols) == 1);
+  }
+  operator bool() const { return conformable; }
 };
 
 template <typename Type> struct eigen_extract_stride { using type = Type; };
 template <typename PlainObjectType, int MapOptions, typename StrideType>
-struct eigen_extract_stride<Eigen::Map<PlainObjectType, MapOptions, StrideType>> { using type = StrideType; };
+struct eigen_extract_stride<
+    Eigen::Map<PlainObjectType, MapOptions, StrideType>> {
+  using type = StrideType;
+};
 template <typename PlainObjectType, int Options, typename StrideType>
-struct eigen_extract_stride<Eigen::Ref<PlainObjectType, Options, StrideType>> { using type = StrideType; };
+struct eigen_extract_stride<Eigen::Ref<PlainObjectType, Options, StrideType>> {
+  using type = StrideType;
+};
 
 // Helper struct for extracting information from an Eigen type
 template <typename Type_> struct EigenProps {
-    using Type = Type_;
-    using Scalar = typename Type::Scalar;
-    using StrideType = typename eigen_extract_stride<Type>::type;
-    static constexpr EigenIndex
-        rows = Type::RowsAtCompileTime,
-        cols = Type::ColsAtCompileTime,
-        size = Type::SizeAtCompileTime;
-    static constexpr bool
-        row_major = Type::IsRowMajor,
-        vector = Type::IsVectorAtCompileTime, // At least one dimension has fixed size 1
-        fixed_rows = rows != Eigen::Dynamic,
-        fixed_cols = cols != Eigen::Dynamic,
-        fixed = size != Eigen::Dynamic, // Fully-fixed size
-        dynamic = !fixed_rows && !fixed_cols; // Fully-dynamic size
+  using Type = Type_;
+  using Scalar = typename Type::Scalar;
+  using StrideType = typename eigen_extract_stride<Type>::type;
+  static constexpr EigenIndex rows = Type::RowsAtCompileTime,
+                              cols = Type::ColsAtCompileTime,
+                              size = Type::SizeAtCompileTime;
+  static constexpr bool
+      row_major = Type::IsRowMajor,
+      vector = Type::IsVectorAtCompileTime, // At least one dimension has fixed
+                                            // size 1
+      fixed_rows = rows != Eigen::Dynamic, fixed_cols = cols != Eigen::Dynamic,
+      fixed = size != Eigen::Dynamic,       // Fully-fixed size
+      dynamic = !fixed_rows && !fixed_cols; // Fully-dynamic size
 
-    template <EigenIndex i, EigenIndex ifzero> using if_zero = std::integral_constant<EigenIndex, i == 0 ? ifzero : i>;
-    static constexpr EigenIndex inner_stride = if_zero<StrideType::InnerStrideAtCompileTime, 1>::value,
-                                outer_stride = if_zero<StrideType::OuterStrideAtCompileTime,
-                                                       vector ? size : row_major ? cols : rows>::value;
-    static constexpr bool dynamic_stride = inner_stride == Eigen::Dynamic && outer_stride == Eigen::Dynamic;
-    static constexpr bool requires_row_major = !dynamic_stride && !vector && (row_major ? inner_stride : outer_stride) == 1;
-    static constexpr bool requires_col_major = !dynamic_stride && !vector && (row_major ? outer_stride : inner_stride) == 1;
+  template <EigenIndex i, EigenIndex ifzero>
+  using if_zero = std::integral_constant<EigenIndex, i == 0 ? ifzero : i>;
+  static constexpr EigenIndex
+      inner_stride = if_zero<StrideType::InnerStrideAtCompileTime, 1>::value,
+      outer_stride = if_zero < StrideType::OuterStrideAtCompileTime,
+      vector ? size : row_major ? cols : rows > ::value;
+  static constexpr bool dynamic_stride =
+      inner_stride == Eigen::Dynamic && outer_stride == Eigen::Dynamic;
+  static constexpr bool requires_row_major =
+      !dynamic_stride && !vector &&
+      (row_major ? inner_stride : outer_stride) == 1;
+  static constexpr bool requires_col_major =
+      !dynamic_stride && !vector &&
+      (row_major ? outer_stride : inner_stride) == 1;
 
-    // Takes an input array and determines whether we can make it fit into the Eigen type.  If
-    // the array is a vector, we attempt to fit it into either an Eigen 1xN or Nx1 vector
-    // (preferring the latter if it will fit in either, i.e. for a fully dynamic matrix type).
-    static EigenConformable<row_major> conformable(const array &a) {
-        const auto dims = a.ndim();
-        if (dims < 1 || dims > 2)
-            return false;
+  // Takes an input array and determines whether we can make it fit into the
+  // Eigen type.  If the array is a vector, we attempt to fit it into either an
+  // Eigen 1xN or Nx1 vector (preferring the latter if it will fit in either,
+  // i.e. for a fully dynamic matrix type).
+  static EigenConformable<row_major> conformable(const array &a) {
+    const auto dims = a.ndim();
+    if (dims < 1 || dims > 2)
+      return false;
 
-        if (dims == 2) { // Matrix type: require exact match (or dynamic)
+    if (dims == 2) { // Matrix type: require exact match (or dynamic)
 
-            EigenIndex
-                np_rows = a.shape(0),
-                np_cols = a.shape(1),
-                np_rstride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar)),
-                np_cstride = a.strides(1) / static_cast<ssize_t>(sizeof(Scalar));
-            if ((fixed_rows && np_rows != rows) || (fixed_cols && np_cols != cols))
-                return false;
+      EigenIndex np_rows = a.shape(0), np_cols = a.shape(1),
+                 np_rstride =
+                     a.strides(0) / static_cast<ssize_t>(sizeof(Scalar)),
+                 np_cstride =
+                     a.strides(1) / static_cast<ssize_t>(sizeof(Scalar));
+      if ((fixed_rows && np_rows != rows) || (fixed_cols && np_cols != cols))
+        return false;
 
-            return {np_rows, np_cols, np_rstride, np_cstride};
-        }
-
-        // Otherwise we're storing an n-vector.  Only one of the strides will be used, but whichever
-        // is used, we want the (single) numpy stride value.
-        const EigenIndex n = a.shape(0),
-              stride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar));
-
-        if (vector) { // Eigen type is a compile-time vector
-            if (fixed && size != n)
-                return false; // Vector size mismatch
-            return {rows == 1 ? 1 : n, cols == 1 ? 1 : n, stride};
-        }
-        else if (fixed) {
-            // The type has a fixed size, but is not a vector: abort
-            return false;
-        }
-        else if (fixed_cols) {
-            // Since this isn't a vector, cols must be != 1.  We allow this only if it exactly
-            // equals the number of elements (rows is Dynamic, and so 1 row is allowed).
-            if (cols != n) return false;
-            return {1, n, stride};
-        }
-        else {
-            // Otherwise it's either fully dynamic, or column dynamic; both become a column vector
-            if (fixed_rows && rows != n) return false;
-            return {n, 1, stride};
-        }
+      return {np_rows, np_cols, np_rstride, np_cstride};
     }
 
-    static constexpr bool show_writeable = is_eigen_dense_map<Type>::value && is_eigen_mutable_map<Type>::value;
-    static constexpr bool show_order = is_eigen_dense_map<Type>::value;
-    static constexpr bool show_c_contiguous = show_order && requires_row_major;
-    static constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major;
+    // Otherwise we're storing an n-vector.  Only one of the strides will be
+    // used, but whichever is used, we want the (single) numpy stride value.
+    const EigenIndex n = a.shape(0),
+                     stride =
+                         a.strides(0) / static_cast<ssize_t>(sizeof(Scalar));
 
-    static constexpr auto descriptor =
-        _("numpy.ndarray[") + npy_format_descriptor<Scalar>::name +
-        _("[")  + _<fixed_rows>(_<(size_t) rows>(), _("m")) +
-        _(", ") + _<fixed_cols>(_<(size_t) cols>(), _("n")) +
-        _("]") +
-        // For a reference type (e.g. Ref<MatrixXd>) we have other constraints that might need to be
-        // satisfied: writeable=True (for a mutable reference), and, depending on the map's stride
-        // options, possibly f_contiguous or c_contiguous.  We include them in the descriptor output
-        // to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to
-        // see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you
-        // *gave* a numpy.ndarray of the right type and dimensions.
-        _<show_writeable>(", flags.writeable", "") +
-        _<show_c_contiguous>(", flags.c_contiguous", "") +
-        _<show_f_contiguous>(", flags.f_contiguous", "") +
-        _("]");
+    if (vector) { // Eigen type is a compile-time vector
+      if (fixed && size != n)
+        return false; // Vector size mismatch
+      return {rows == 1 ? 1 : n, cols == 1 ? 1 : n, stride};
+    } else if (fixed) {
+      // The type has a fixed size, but is not a vector: abort
+      return false;
+    } else if (fixed_cols) {
+      // Since this isn't a vector, cols must be != 1.  We allow this only if it
+      // exactly equals the number of elements (rows is Dynamic, and so 1 row is
+      // allowed).
+      if (cols != n)
+        return false;
+      return {1, n, stride};
+    } else {
+      // Otherwise it's either fully dynamic, or column dynamic; both become a
+      // column vector
+      if (fixed_rows && rows != n)
+        return false;
+      return {n, 1, stride};
+    }
+  }
+
+  static constexpr bool show_writeable =
+      is_eigen_dense_map<Type>::value && is_eigen_mutable_map<Type>::value;
+  static constexpr bool show_order = is_eigen_dense_map<Type>::value;
+  static constexpr bool show_c_contiguous = show_order && requires_row_major;
+  static constexpr bool show_f_contiguous =
+      !show_c_contiguous && show_order && requires_col_major;
+
+  static constexpr auto descriptor =
+      _("numpy.ndarray[") + npy_format_descriptor<Scalar>::name + _("[") +
+      _<fixed_rows>(_<(size_t)rows>(), _("m")) + _(", ") +
+      _<fixed_cols>(_<(size_t)cols>(), _("n")) + _("]") +
+      // For a reference type (e.g. Ref<MatrixXd>) we have other constraints
+      // that might need to be satisfied: writeable=True (for a mutable
+      // reference), and, depending on the map's stride options, possibly
+      // f_contiguous or c_contiguous.  We include them in the descriptor output
+      // to provide some hint as to why a TypeError is occurring (otherwise it
+      // can be confusing to see that a function accepts a
+      // 'numpy.ndarray[float64[3,2]]' and an error message that you *gave* a
+      // numpy.ndarray of the right type and dimensions.
+      _<show_writeable>(", flags.writeable", "") +
+      _<show_c_contiguous>(", flags.c_contiguous", "") +
+      _<show_f_contiguous>(", flags.f_contiguous", "") + _("]");
 };
 
-// Casts an Eigen type to numpy array.  If given a base, the numpy array references the src data,
-// otherwise it'll make a copy.  writeable lets you turn off the writeable flag for the array.
-template <typename props> handle eigen_array_cast(typename props::Type const &src, handle base = handle(), bool writeable = true) {
-    constexpr ssize_t elem_size = sizeof(typename props::Scalar);
-    array a;
-    if (props::vector)
-        a = array({ src.size() }, { elem_size * src.innerStride() }, src.data(), base);
-    else
-        a = array({ src.rows(), src.cols() }, { elem_size * src.rowStride(), elem_size * src.colStride() },
-                  src.data(), base);
+// Casts an Eigen type to numpy array.  If given a base, the numpy array
+// references the src data, otherwise it'll make a copy.  writeable lets you
+// turn off the writeable flag for the array.
+template <typename props>
+handle eigen_array_cast(typename props::Type const &src, handle base = handle(),
+                        bool writeable = true) {
+  constexpr ssize_t elem_size = sizeof(typename props::Scalar);
+  array a;
+  if (props::vector)
+    a = array({src.size()}, {elem_size * src.innerStride()}, src.data(), base);
+  else
+    a = array({src.rows(), src.cols()},
+              {elem_size * src.rowStride(), elem_size * src.colStride()},
+              src.data(), base);
 
-    if (!writeable)
-        array_proxy(a.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_;
+  if (!writeable)
+    array_proxy(a.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_;
 
-    return a.release();
+  return a.release();
 }
 
-// Takes an lvalue ref to some Eigen type and a (python) base object, creating a numpy array that
-// reference the Eigen object's data with `base` as the python-registered base class (if omitted,
-// the base will be set to None, and lifetime management is up to the caller).  The numpy array is
-// non-writeable if the given type is const.
+// Takes an lvalue ref to some Eigen type and a (python) base object, creating a
+// numpy array that references the Eigen object's data with `base` as the
+// python-registered base class (if omitted, the base will be set to None, and
+// lifetime management is up to the caller).  The numpy array is non-writeable
+// if the given type is const.
 template <typename props, typename Type>
 handle eigen_ref_array(Type &src, handle parent = none()) {
-    // none here is to get past array's should-we-copy detection, which currently always
-    // copies when there is no base.  Setting the base to None should be harmless.
-    return eigen_array_cast<props>(src, parent, !std::is_const<Type>::value);
+  // none here is to get past array's should-we-copy detection, which currently
+  // always copies when there is no base.  Setting the base to None should be
+  // harmless.
+  return eigen_array_cast<props>(src, parent, !std::is_const<Type>::value);
 }
 
-// Takes a pointer to some dense, plain Eigen type, builds a capsule around it, then returns a numpy
-// array that references the encapsulated data with a python-side reference to the capsule to tie
-// its destruction to that of any dependent python objects.  Const-ness is determined by whether or
-// not the Type of the pointer given is const.
-template <typename props, typename Type, typename = enable_if_t<is_eigen_dense_plain<Type>::value>>
+// Takes a pointer to some dense, plain Eigen type, builds a capsule around it,
+// then returns a numpy array that references the encapsulated data with a
+// python-side reference to the capsule to tie its destruction to that of any
+// dependent python objects.  Const-ness is determined by whether or not the
+// Type of the pointer given is const.
+template <typename props, typename Type,
+          typename = enable_if_t<is_eigen_dense_plain<Type>::value>>
 handle eigen_encapsulate(Type *src) {
-    capsule base(src, [](void *o) { delete static_cast<Type *>(o); });
-    return eigen_ref_array<props>(*src, base);
+  capsule base(src, [](void *o) { delete static_cast<Type *>(o); });
+  return eigen_ref_array<props>(*src, base);
 }
 
-// Type caster for regular, dense matrix types (e.g. MatrixXd), but not maps/refs/etc. of dense
-// types.
-template<typename Type>
+// Type caster for regular, dense matrix types (e.g. MatrixXd), but not
+// maps/refs/etc. of dense types.
+template <typename Type>
 struct type_caster<Type, enable_if_t<is_eigen_dense_plain<Type>::value>> {
-    using Scalar = typename Type::Scalar;
-    using props = EigenProps<Type>;
+  using Scalar = typename Type::Scalar;
+  using props = EigenProps<Type>;
 
-    bool load(handle src, bool convert) {
-        // If we're in no-convert mode, only load if given an array of the correct type
-        if (!convert && !isinstance<array_t<Scalar>>(src))
-            return false;
+  bool load(handle src, bool convert) {
+    // If we're in no-convert mode, only load if given an array of the correct
+    // type
+    if (!convert && !isinstance<array_t<Scalar>>(src))
+      return false;
 
-        // Coerce into an array, but don't do type conversion yet; the copy below handles it.
-        auto buf = array::ensure(src);
+    // Coerce into an array, but don't do type conversion yet; the copy below
+    // handles it.
+    auto buf = array::ensure(src);
 
-        if (!buf)
-            return false;
+    if (!buf)
+      return false;
 
-        auto dims = buf.ndim();
-        if (dims < 1 || dims > 2)
-            return false;
+    auto dims = buf.ndim();
+    if (dims < 1 || dims > 2)
+      return false;
 
-        auto fits = props::conformable(buf);
-        if (!fits)
-            return false;
+    auto fits = props::conformable(buf);
+    if (!fits)
+      return false;
 
-        // Allocate the new type, then build a numpy reference into it
-        value = Type(fits.rows, fits.cols);
-        auto ref = reinterpret_steal<array>(eigen_ref_array<props>(value));
-        if (dims == 1) ref = ref.squeeze();
-        else if (ref.ndim() == 1) buf = buf.squeeze();
+    // Allocate the new type, then build a numpy reference into it
+    value = Type(fits.rows, fits.cols);
+    auto ref = reinterpret_steal<array>(eigen_ref_array<props>(value));
+    if (dims == 1)
+      ref = ref.squeeze();
+    else if (ref.ndim() == 1)
+      buf = buf.squeeze();
 
-        int result = detail::npy_api::get().PyArray_CopyInto_(ref.ptr(), buf.ptr());
+    int result = detail::npy_api::get().PyArray_CopyInto_(ref.ptr(), buf.ptr());
 
-        if (result < 0) { // Copy failed!
-            PyErr_Clear();
-            return false;
-        }
-
-        return true;
+    if (result < 0) { // Copy failed!
+      PyErr_Clear();
+      return false;
     }
 
+    return true;
+  }
+
 private:
-
-    // Cast implementation
-    template <typename CType>
-    static handle cast_impl(CType *src, return_value_policy policy, handle parent) {
-        switch (policy) {
-            case return_value_policy::take_ownership:
-            case return_value_policy::automatic:
-                return eigen_encapsulate<props>(src);
-            case return_value_policy::move:
-                return eigen_encapsulate<props>(new CType(std::move(*src)));
-            case return_value_policy::copy:
-                return eigen_array_cast<props>(*src);
-            case return_value_policy::reference:
-            case return_value_policy::automatic_reference:
-                return eigen_ref_array<props>(*src);
-            case return_value_policy::reference_internal:
-                return eigen_ref_array<props>(*src, parent);
-            default:
-                throw cast_error("unhandled return_value_policy: should not happen!");
-        };
-    }
+  // Cast implementation
+  template <typename CType>
+  static handle cast_impl(CType *src, return_value_policy policy,
+                          handle parent) {
+    switch (policy) {
+    case return_value_policy::take_ownership:
+    case return_value_policy::automatic:
+      return eigen_encapsulate<props>(src);
+    case return_value_policy::move:
+      return eigen_encapsulate<props>(new CType(std::move(*src)));
+    case return_value_policy::copy:
+      return eigen_array_cast<props>(*src);
+    case return_value_policy::reference:
+    case return_value_policy::automatic_reference:
+      return eigen_ref_array<props>(*src);
+    case return_value_policy::reference_internal:
+      return eigen_ref_array<props>(*src, parent);
+    default:
+      throw cast_error("unhandled return_value_policy: should not happen!");
+    };
+  }
 
 public:
+  // Normal returned non-reference, non-const value:
+  static handle cast(Type &&src, return_value_policy /* policy */,
+                     handle parent) {
+    return cast_impl(&src, return_value_policy::move, parent);
+  }
+  // If you return a non-reference const, we mark the numpy array readonly:
+  static handle cast(const Type &&src, return_value_policy /* policy */,
+                     handle parent) {
+    return cast_impl(&src, return_value_policy::move, parent);
+  }
+  // lvalue reference return; default (automatic) becomes copy
+  static handle cast(Type &src, return_value_policy policy, handle parent) {
+    if (policy == return_value_policy::automatic ||
+        policy == return_value_policy::automatic_reference)
+      policy = return_value_policy::copy;
+    return cast_impl(&src, policy, parent);
+  }
+  // const lvalue reference return; default (automatic) becomes copy
+  static handle cast(const Type &src, return_value_policy policy,
+                     handle parent) {
+    if (policy == return_value_policy::automatic ||
+        policy == return_value_policy::automatic_reference)
+      policy = return_value_policy::copy;
+    return cast(&src, policy, parent);
+  }
+  // non-const pointer return
+  static handle cast(Type *src, return_value_policy policy, handle parent) {
+    return cast_impl(src, policy, parent);
+  }
+  // const pointer return
+  static handle cast(const Type *src, return_value_policy policy,
+                     handle parent) {
+    return cast_impl(src, policy, parent);
+  }
 
-    // Normal returned non-reference, non-const value:
-    static handle cast(Type &&src, return_value_policy /* policy */, handle parent) {
-        return cast_impl(&src, return_value_policy::move, parent);
-    }
-    // If you return a non-reference const, we mark the numpy array readonly:
-    static handle cast(const Type &&src, return_value_policy /* policy */, handle parent) {
-        return cast_impl(&src, return_value_policy::move, parent);
-    }
-    // lvalue reference return; default (automatic) becomes copy
-    static handle cast(Type &src, return_value_policy policy, handle parent) {
-        if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
-            policy = return_value_policy::copy;
-        return cast_impl(&src, policy, parent);
-    }
-    // const lvalue reference return; default (automatic) becomes copy
-    static handle cast(const Type &src, return_value_policy policy, handle parent) {
-        if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
-            policy = return_value_policy::copy;
-        return cast(&src, policy, parent);
-    }
-    // non-const pointer return
-    static handle cast(Type *src, return_value_policy policy, handle parent) {
-        return cast_impl(src, policy, parent);
-    }
-    // const pointer return
-    static handle cast(const Type *src, return_value_policy policy, handle parent) {
-        return cast_impl(src, policy, parent);
-    }
+  static constexpr auto name = props::descriptor;
 
-    static constexpr auto name = props::descriptor;
-
-    operator Type*() { return &value; }
-    operator Type&() { return value; }
-    operator Type&&() && { return std::move(value); }
-    template <typename T> using cast_op_type = movable_cast_op_type<T>;
+  operator Type *() { return &value; }
+  operator Type &() { return value; }
+  operator Type &&() && { return std::move(value); }
+  template <typename T> using cast_op_type = movable_cast_op_type<T>;
 
 private:
-    Type value;
+  Type value;
 };
 
 // Base class for casting reference/map/block/etc. objects back to python.
 template <typename MapType> struct eigen_map_caster {
 private:
-    using props = EigenProps<MapType>;
+  using props = EigenProps<MapType>;
 
 public:
-
-    // Directly referencing a ref/map's data is a bit dangerous (whatever the map/ref points to has
-    // to stay around), but we'll allow it under the assumption that you know what you're doing (and
-    // have an appropriate keep_alive in place).  We return a numpy array pointing directly at the
-    // ref's data (The numpy array ends up read-only if the ref was to a const matrix type.) Note
-    // that this means you need to ensure you don't destroy the object in some other way (e.g. with
-    // an appropriate keep_alive, or with a reference to a statically allocated matrix).
-    static handle cast(const MapType &src, return_value_policy policy, handle parent) {
-        switch (policy) {
-            case return_value_policy::copy:
-                return eigen_array_cast<props>(src);
-            case return_value_policy::reference_internal:
-                return eigen_array_cast<props>(src, parent, is_eigen_mutable_map<MapType>::value);
-            case return_value_policy::reference:
-            case return_value_policy::automatic:
-            case return_value_policy::automatic_reference:
-                return eigen_array_cast<props>(src, none(), is_eigen_mutable_map<MapType>::value);
-            default:
-                // move, take_ownership don't make any sense for a ref/map:
-                pybind11_fail("Invalid return_value_policy for Eigen Map/Ref/Block type");
-        }
+  // Directly referencing a ref/map's data is a bit dangerous (whatever the
+  // map/ref points to has to stay around), but we'll allow it under the
+  // assumption that you know what you're doing (and have an appropriate
+  // keep_alive in place).  We return a numpy array pointing directly at the
+  // ref's data (The numpy array ends up read-only if the ref was to a const
+  // matrix type.) Note that this means you need to ensure you don't destroy the
+  // object in some other way (e.g. with an appropriate keep_alive, or with a
+  // reference to a statically allocated matrix).
+  static handle cast(const MapType &src, return_value_policy policy,
+                     handle parent) {
+    switch (policy) {
+    case return_value_policy::copy:
+      return eigen_array_cast<props>(src);
+    case return_value_policy::reference_internal:
+      return eigen_array_cast<props>(src, parent,
+                                     is_eigen_mutable_map<MapType>::value);
+    case return_value_policy::reference:
+    case return_value_policy::automatic:
+    case return_value_policy::automatic_reference:
+      return eigen_array_cast<props>(src, none(),
+                                     is_eigen_mutable_map<MapType>::value);
+    default:
+      // move, take_ownership don't make any sense for a ref/map:
+      pybind11_fail("Invalid return_value_policy for Eigen Map/Ref/Block type");
     }
+  }
 
-    static constexpr auto name = props::descriptor;
+  static constexpr auto name = props::descriptor;
 
-    // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return
-    // types but not bound arguments).  We still provide them (with an explicitly delete) so that
-    // you end up here if you try anyway.
-    bool load(handle, bool) = delete;
-    operator MapType() = delete;
-    template <typename> using cast_op_type = MapType;
+  // Explicitly deleted: python -> C++ conversion is not supported for these
+  // (they can be return types but not bound arguments).  We still declare them
+  // (as explicitly deleted) so that you end up here if you try anyway.
+  bool load(handle, bool) = delete;
+  operator MapType() = delete;
+  template <typename> using cast_op_type = MapType;
 };
 
 // We can return any map-like object (but can only load Refs, specialized next):
-template <typename Type> struct type_caster<Type, enable_if_t<is_eigen_dense_map<Type>::value>>
+template <typename Type>
+struct type_caster<Type, enable_if_t<is_eigen_dense_map<Type>::value>>
     : eigen_map_caster<Type> {};
 
-// Loader for Ref<...> arguments.  See the documentation for info on how to make this work without
-// copying (it requires some extra effort in many cases).
+// Loader for Ref<...> arguments.  See the documentation for info on how to make
+// this work without copying (it requires some extra effort in many cases).
 template <typename PlainObjectType, typename StrideType>
-struct type_caster<
-    Eigen::Ref<PlainObjectType, 0, StrideType>,
-    enable_if_t<is_eigen_dense_map<Eigen::Ref<PlainObjectType, 0, StrideType>>::value>
-> : public eigen_map_caster<Eigen::Ref<PlainObjectType, 0, StrideType>> {
+struct type_caster<Eigen::Ref<PlainObjectType, 0, StrideType>,
+                   enable_if_t<is_eigen_dense_map<
+                       Eigen::Ref<PlainObjectType, 0, StrideType>>::value>>
+    : public eigen_map_caster<Eigen::Ref<PlainObjectType, 0, StrideType>> {
 private:
-    using Type = Eigen::Ref<PlainObjectType, 0, StrideType>;
-    using props = EigenProps<Type>;
-    using Scalar = typename props::Scalar;
-    using MapType = Eigen::Map<PlainObjectType, 0, StrideType>;
-    using Array = array_t<Scalar, array::forcecast |
-                ((props::row_major ? props::inner_stride : props::outer_stride) == 1 ? array::c_style :
-                 (props::row_major ? props::outer_stride : props::inner_stride) == 1 ? array::f_style : 0)>;
-    static constexpr bool need_writeable = is_eigen_mutable_map<Type>::value;
-    // Delay construction (these have no default constructor)
-    std::unique_ptr<MapType> map;
-    std::unique_ptr<Type> ref;
-    // Our array.  When possible, this is just a numpy array pointing to the source data, but
-    // sometimes we can't avoid copying (e.g. input is not a numpy array at all, has an incompatible
-    // layout, or is an array of a type that needs to be converted).  Using a numpy temporary
-    // (rather than an Eigen temporary) saves an extra copy when we need both type conversion and
-    // storage order conversion.  (Note that we refuse to use this temporary copy when loading an
-    // argument for a Ref<M> with M non-const, i.e. a read-write reference).
-    Array copy_or_ref;
+  using Type = Eigen::Ref<PlainObjectType, 0, StrideType>;
+  using props = EigenProps<Type>;
+  using Scalar = typename props::Scalar;
+  using MapType = Eigen::Map<PlainObjectType, 0, StrideType>;
+  using Array =
+      array_t<Scalar, array::forcecast |
+                          ((props::row_major ? props::inner_stride
+                                             : props::outer_stride) == 1
+                               ? array::c_style
+                               : (props::row_major ? props::outer_stride
+                                                   : props::inner_stride) == 1
+                                     ? array::f_style
+                                     : 0)>;
+  static constexpr bool need_writeable = is_eigen_mutable_map<Type>::value;
+  // Delay construction (these have no default constructor)
+  std::unique_ptr<MapType> map;
+  std::unique_ptr<Type> ref;
+  // Our array.  When possible, this is just a numpy array pointing to the
+  // source data, but sometimes we can't avoid copying (e.g. input is not a
+  // numpy array at all, has an incompatible layout, or is an array of a type
+  // that needs to be converted).  Using a numpy temporary (rather than an Eigen
+  // temporary) saves an extra copy when we need both type conversion and
+  // storage order conversion.  (Note that we refuse to use this temporary copy
+  // when loading an argument for a Ref<M> with M non-const, i.e. a read-write
+  // reference).
+  Array copy_or_ref;
+
 public:
-    bool load(handle src, bool convert) {
-        // First check whether what we have is already an array of the right type.  If not, we can't
-        // avoid a copy (because the copy is also going to do type conversion).
-        bool need_copy = !isinstance<Array>(src);
+  bool load(handle src, bool convert) {
+    // First check whether what we have is already an array of the right type.
+    // If not, we can't avoid a copy (because the copy is also going to do type
+    // conversion).
+    bool need_copy = !isinstance<Array>(src);
 
-        EigenConformable<props::row_major> fits;
-        if (!need_copy) {
-            // We don't need a converting copy, but we also need to check whether the strides are
-            // compatible with the Ref's stride requirements
-            Array aref = reinterpret_borrow<Array>(src);
+    EigenConformable<props::row_major> fits;
+    if (!need_copy) {
+      // We don't need a converting copy, but we also need to check whether the
+      // strides are compatible with the Ref's stride requirements
+      Array aref = reinterpret_borrow<Array>(src);
 
-            if (aref && (!need_writeable || aref.writeable())) {
-                fits = props::conformable(aref);
-                if (!fits) return false; // Incompatible dimensions
-                if (!fits.template stride_compatible<props>())
-                    need_copy = true;
-                else
-                    copy_or_ref = std::move(aref);
-            }
-            else {
-                need_copy = true;
-            }
-        }
-
-        if (need_copy) {
-            // We need to copy: If we need a mutable reference, or we're not supposed to convert
-            // (either because we're in the no-convert overload pass, or because we're explicitly
-            // instructed not to copy (via `py::arg().noconvert()`) we have to fail loading.
-            if (!convert || need_writeable) return false;
-
-            Array copy = Array::ensure(src);
-            if (!copy) return false;
-            fits = props::conformable(copy);
-            if (!fits || !fits.template stride_compatible<props>())
-                return false;
-            copy_or_ref = std::move(copy);
-            loader_life_support::add_patient(copy_or_ref);
-        }
-
-        ref.reset();
-        map.reset(new MapType(data(copy_or_ref), fits.rows, fits.cols, make_stride(fits.stride.outer(), fits.stride.inner())));
-        ref.reset(new Type(*map));
-
-        return true;
+      if (aref && (!need_writeable || aref.writeable())) {
+        fits = props::conformable(aref);
+        if (!fits)
+          return false; // Incompatible dimensions
+        if (!fits.template stride_compatible<props>())
+          need_copy = true;
+        else
+          copy_or_ref = std::move(aref);
+      } else {
+        need_copy = true;
+      }
     }
 
-    operator Type*() { return ref.get(); }
-    operator Type&() { return *ref; }
-    template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
+    if (need_copy) {
+      // We need to copy: If we need a mutable reference, or we're not supposed
+      // to convert (either because we're in the no-convert overload pass, or
+      // because we're explicitly instructed not to copy (via
+      // `py::arg().noconvert()`)), we have to fail loading.
+      if (!convert || need_writeable)
+        return false;
+
+      Array copy = Array::ensure(src);
+      if (!copy)
+        return false;
+      fits = props::conformable(copy);
+      if (!fits || !fits.template stride_compatible<props>())
+        return false;
+      copy_or_ref = std::move(copy);
+      loader_life_support::add_patient(copy_or_ref);
+    }
+
+    ref.reset();
+    map.reset(
+        new MapType(data(copy_or_ref), fits.rows, fits.cols,
+                    make_stride(fits.stride.outer(), fits.stride.inner())));
+    ref.reset(new Type(*map));
+
+    return true;
+  }
+
+  operator Type *() { return ref.get(); }
+  operator Type &() { return *ref; }
+  template <typename _T>
+  using cast_op_type = pybind11::detail::cast_op_type<_T>;
 
 private:
-    template <typename T = Type, enable_if_t<is_eigen_mutable_map<T>::value, int> = 0>
-    Scalar *data(Array &a) { return a.mutable_data(); }
+  template <typename T = Type,
+            enable_if_t<is_eigen_mutable_map<T>::value, int> = 0>
+  Scalar *data(Array &a) {
+    return a.mutable_data();
+  }
 
-    template <typename T = Type, enable_if_t<!is_eigen_mutable_map<T>::value, int> = 0>
-    const Scalar *data(Array &a) { return a.data(); }
+  template <typename T = Type,
+            enable_if_t<!is_eigen_mutable_map<T>::value, int> = 0>
+  const Scalar *data(Array &a) {
+    return a.data();
+  }
 
-    // Attempt to figure out a constructor of `Stride` that will work.
-    // If both strides are fixed, use a default constructor:
-    template <typename S> using stride_ctor_default = bool_constant<
-        S::InnerStrideAtCompileTime != Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic &&
-        std::is_default_constructible<S>::value>;
-    // Otherwise, if there is a two-index constructor, assume it is (outer,inner) like
-    // Eigen::Stride, and use it:
-    template <typename S> using stride_ctor_dual = bool_constant<
-        !stride_ctor_default<S>::value && std::is_constructible<S, EigenIndex, EigenIndex>::value>;
-    // Otherwise, if there is a one-index constructor, and just one of the strides is dynamic, use
-    // it (passing whichever stride is dynamic).
-    template <typename S> using stride_ctor_outer = bool_constant<
-        !any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value &&
-        S::OuterStrideAtCompileTime == Eigen::Dynamic && S::InnerStrideAtCompileTime != Eigen::Dynamic &&
-        std::is_constructible<S, EigenIndex>::value>;
-    template <typename S> using stride_ctor_inner = bool_constant<
-        !any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value &&
-        S::InnerStrideAtCompileTime == Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic &&
-        std::is_constructible<S, EigenIndex>::value>;
-
-    template <typename S = StrideType, enable_if_t<stride_ctor_default<S>::value, int> = 0>
-    static S make_stride(EigenIndex, EigenIndex) { return S(); }
-    template <typename S = StrideType, enable_if_t<stride_ctor_dual<S>::value, int> = 0>
-    static S make_stride(EigenIndex outer, EigenIndex inner) { return S(outer, inner); }
-    template <typename S = StrideType, enable_if_t<stride_ctor_outer<S>::value, int> = 0>
-    static S make_stride(EigenIndex outer, EigenIndex) { return S(outer); }
-    template <typename S = StrideType, enable_if_t<stride_ctor_inner<S>::value, int> = 0>
-    static S make_stride(EigenIndex, EigenIndex inner) { return S(inner); }
+  // Attempt to figure out a constructor of `Stride` that will work.
+  // If both strides are fixed, use a default constructor:
+  template <typename S>
+  using stride_ctor_default =
+      bool_constant<S::InnerStrideAtCompileTime != Eigen::Dynamic &&
+                    S::OuterStrideAtCompileTime != Eigen::Dynamic &&
+                    std::is_default_constructible<S>::value>;
+  // Otherwise, if there is a two-index constructor, assume it is (outer,inner)
+  // like Eigen::Stride, and use it:
+  template <typename S>
+  using stride_ctor_dual =
+      bool_constant<!stride_ctor_default<S>::value &&
+                    std::is_constructible<S, EigenIndex, EigenIndex>::value>;
+  // Otherwise, if there is a one-index constructor, and just one of the strides
+  // is dynamic, use it (passing whichever stride is dynamic).
+  template <typename S>
+  using stride_ctor_outer = bool_constant<
+      !any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value &&
+      S::OuterStrideAtCompileTime == Eigen::Dynamic &&
+      S::InnerStrideAtCompileTime != Eigen::Dynamic &&
+      std::is_constructible<S, EigenIndex>::value>;
+  template <typename S>
+  using stride_ctor_inner = bool_constant<
+      !any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value &&
+      S::InnerStrideAtCompileTime == Eigen::Dynamic &&
+      S::OuterStrideAtCompileTime != Eigen::Dynamic &&
+      std::is_constructible<S, EigenIndex>::value>;
 
+  template <typename S = StrideType,
+            enable_if_t<stride_ctor_default<S>::value, int> = 0>
+  static S make_stride(EigenIndex, EigenIndex) {
+    return S();
+  }
+  template <typename S = StrideType,
+            enable_if_t<stride_ctor_dual<S>::value, int> = 0>
+  static S make_stride(EigenIndex outer, EigenIndex inner) {
+    return S(outer, inner);
+  }
+  template <typename S = StrideType,
+            enable_if_t<stride_ctor_outer<S>::value, int> = 0>
+  static S make_stride(EigenIndex outer, EigenIndex) {
+    return S(outer);
+  }
+  template <typename S = StrideType,
+            enable_if_t<stride_ctor_inner<S>::value, int> = 0>
+  static S make_stride(EigenIndex, EigenIndex inner) {
+    return S(inner);
+  }
 };
 
-// type_caster for special matrix types (e.g. DiagonalMatrix), which are EigenBase, but not
-// EigenDense (i.e. they don't have a data(), at least not with the usual matrix layout).
-// load() is not supported, but we can cast them into the python domain by first copying to a
-// regular Eigen::Matrix, then casting that.
+// type_caster for special matrix types (e.g. DiagonalMatrix), which are
+// EigenBase, but not EigenDense (i.e. they don't have a data(), at least not
+// with the usual matrix layout). load() is not supported, but we can cast them
+// into the python domain by first copying to a regular Eigen::Matrix, then
+// casting that.
 template <typename Type>
 struct type_caster<Type, enable_if_t<is_eigen_other<Type>::value>> {
 protected:
-    using Matrix = Eigen::Matrix<typename Type::Scalar, Type::RowsAtCompileTime, Type::ColsAtCompileTime>;
-    using props = EigenProps<Matrix>;
+  using Matrix = Eigen::Matrix<typename Type::Scalar, Type::RowsAtCompileTime,
+                               Type::ColsAtCompileTime>;
+  using props = EigenProps<Matrix>;
+
 public:
-    static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
-        handle h = eigen_encapsulate<props>(new Matrix(src));
-        return h;
-    }
-    static handle cast(const Type *src, return_value_policy policy, handle parent) { return cast(*src, policy, parent); }
+  static handle cast(const Type &src, return_value_policy /* policy */,
+                     handle /* parent */) {
+    handle h = eigen_encapsulate<props>(new Matrix(src));
+    return h;
+  }
+  static handle cast(const Type *src, return_value_policy policy,
+                     handle parent) {
+    return cast(*src, policy, parent);
+  }
 
-    static constexpr auto name = props::descriptor;
+  static constexpr auto name = props::descriptor;
 
-    // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return
-    // types but not bound arguments).  We still provide them (with an explicitly delete) so that
-    // you end up here if you try anyway.
-    bool load(handle, bool) = delete;
-    operator Type() = delete;
-    template <typename> using cast_op_type = Type;
+  // Explicitly deleted: python -> C++ conversion is not supported for these
+  // (they can be return types but not bound arguments).  We still declare them
+  // (as explicitly deleted) so that you end up here if you try anyway.
+  bool load(handle, bool) = delete;
+  operator Type() = delete;
+  template <typename> using cast_op_type = Type;
 };
 
-template<typename Type>
+template <typename Type>
 struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
-    typedef typename Type::Scalar Scalar;
-    typedef remove_reference_t<decltype(*std::declval<Type>().outerIndexPtr())> StorageIndex;
-    typedef typename Type::Index Index;
-    static constexpr bool rowMajor = Type::IsRowMajor;
+  typedef typename Type::Scalar Scalar;
+  typedef remove_reference_t<decltype(*std::declval<Type>().outerIndexPtr())>
+      StorageIndex;
+  typedef typename Type::Index Index;
+  static constexpr bool rowMajor = Type::IsRowMajor;
 
-    bool load(handle src, bool) {
-        if (!src)
-            return false;
+  bool load(handle src, bool) {
+    if (!src)
+      return false;
 
-        auto obj = reinterpret_borrow<object>(src);
-        object sparse_module = module::import("scipy.sparse");
-        object matrix_type = sparse_module.attr(
-            rowMajor ? "csr_matrix" : "csc_matrix");
+    auto obj = reinterpret_borrow<object>(src);
+    object sparse_module = module::import("scipy.sparse");
+    object matrix_type =
+        sparse_module.attr(rowMajor ? "csr_matrix" : "csc_matrix");
 
-        if (!obj.get_type().is(matrix_type)) {
-            try {
-                obj = matrix_type(obj);
-            } catch (const error_already_set &) {
-                return false;
-            }
-        }
-
-        auto values = array_t<Scalar>((object) obj.attr("data"));
-        auto innerIndices = array_t<StorageIndex>((object) obj.attr("indices"));
-        auto outerIndices = array_t<StorageIndex>((object) obj.attr("indptr"));
-        auto shape = pybind11::tuple((pybind11::object) obj.attr("shape"));
-        auto nnz = obj.attr("nnz").cast<Index>();
-
-        if (!values || !innerIndices || !outerIndices)
-            return false;
-
-        value = Eigen::MappedSparseMatrix<Scalar, Type::Flags, StorageIndex>(
-            shape[0].cast<Index>(), shape[1].cast<Index>(), nnz,
-            outerIndices.mutable_data(), innerIndices.mutable_data(), values.mutable_data());
-
-        return true;
+    if (!obj.get_type().is(matrix_type)) {
+      try {
+        obj = matrix_type(obj);
+      } catch (const error_already_set &) {
+        return false;
+      }
     }
 
-    static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
-        const_cast<Type&>(src).makeCompressed();
+    auto values = array_t<Scalar>((object)obj.attr("data"));
+    auto innerIndices = array_t<StorageIndex>((object)obj.attr("indices"));
+    auto outerIndices = array_t<StorageIndex>((object)obj.attr("indptr"));
+    auto shape = pybind11::tuple((pybind11::object)obj.attr("shape"));
+    auto nnz = obj.attr("nnz").cast<Index>();
 
-        object matrix_type = module::import("scipy.sparse").attr(
-            rowMajor ? "csr_matrix" : "csc_matrix");
+    if (!values || !innerIndices || !outerIndices)
+      return false;
 
-        array data(src.nonZeros(), src.valuePtr());
-        array outerIndices((rowMajor ? src.rows() : src.cols()) + 1, src.outerIndexPtr());
-        array innerIndices(src.nonZeros(), src.innerIndexPtr());
+    value = Eigen::MappedSparseMatrix<Scalar, Type::Flags, StorageIndex>(
+        shape[0].cast<Index>(), shape[1].cast<Index>(), nnz,
+        outerIndices.mutable_data(), innerIndices.mutable_data(),
+        values.mutable_data());
 
-        return matrix_type(
-            std::make_tuple(data, innerIndices, outerIndices),
-            std::make_pair(src.rows(), src.cols())
-        ).release();
-    }
+    return true;
+  }
 
-    PYBIND11_TYPE_CASTER(Type, _<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[")
-            + npy_format_descriptor<Scalar>::name + _("]"));
+  static handle cast(const Type &src, return_value_policy /* policy */,
+                     handle /* parent */) {
+    const_cast<Type &>(src).makeCompressed();
+
+    object matrix_type = module::import("scipy.sparse")
+                             .attr(rowMajor ? "csr_matrix" : "csc_matrix");
+
+    array data(src.nonZeros(), src.valuePtr());
+    array outerIndices((rowMajor ? src.rows() : src.cols()) + 1,
+                       src.outerIndexPtr());
+    array innerIndices(src.nonZeros(), src.innerIndexPtr());
+
+    return matrix_type(std::make_tuple(data, innerIndices, outerIndices),
+                       std::make_pair(src.rows(), src.cols()))
+        .release();
+  }
+
+  PYBIND11_TYPE_CASTER(Type,
+                       _<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[",
+                                                  "scipy.sparse.csc_matrix[") +
+                           npy_format_descriptor<Scalar>::name + _("]"));
 };
 
 NAMESPACE_END(detail)
 NAMESPACE_END(PYBIND11_NAMESPACE)
 
 #if defined(__GNUG__) || defined(__clang__)
-#  pragma GCC diagnostic pop
+#pragma GCC diagnostic pop
 #elif defined(_MSC_VER)
-#  pragma warning(pop)
+#pragma warning(pop)
 #endif
diff --git a/python/src/pybind11/embed.h b/python/src/pybind11/embed.h
index 72655885e..989e27df3 100644
--- a/python/src/pybind11/embed.h
+++ b/python/src/pybind11/embed.h
@@ -9,23 +9,23 @@
 
 #pragma once
 
-#include "pybind11.h"
 #include "eval.h"
+#include "pybind11.h"
 
 #if defined(PYPY_VERSION)
-#  error Embedding the interpreter is not supported with PyPy
+#error Embedding the interpreter is not supported with PyPy
 #endif
 
 #if PY_MAJOR_VERSION >= 3
-#  define PYBIND11_EMBEDDED_MODULE_IMPL(name)            \
-      extern "C" PyObject *pybind11_init_impl_##name() { \
-          return pybind11_init_wrapper_##name();         \
-      }
+#define PYBIND11_EMBEDDED_MODULE_IMPL(name)                                    \
+  extern "C" PyObject *pybind11_init_impl_##name() {                           \
+    return pybind11_init_wrapper_##name();                                     \
+  }
 #else
-#  define PYBIND11_EMBEDDED_MODULE_IMPL(name)            \
-      extern "C" void pybind11_init_impl_##name() {      \
-          pybind11_init_wrapper_##name();                \
-      }
+#define PYBIND11_EMBEDDED_MODULE_IMPL(name)                                    \
+  extern "C" void pybind11_init_impl_##name() {                                \
+    pybind11_init_wrapper_##name();                                            \
+  }
 #endif
 
 /** \rst
@@ -43,75 +43,78 @@
             });
         }
  \endrst */
-#define PYBIND11_EMBEDDED_MODULE(name, variable)                              \
-    static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &);    \
-    static PyObject PYBIND11_CONCAT(*pybind11_init_wrapper_, name)() {        \
-        auto m = pybind11::module(PYBIND11_TOSTRING(name));                   \
-        try {                                                                 \
-            PYBIND11_CONCAT(pybind11_init_, name)(m);                         \
-            return m.ptr();                                                   \
-        } catch (pybind11::error_already_set &e) {                            \
-            PyErr_SetString(PyExc_ImportError, e.what());                     \
-            return nullptr;                                                   \
-        } catch (const std::exception &e) {                                   \
-            PyErr_SetString(PyExc_ImportError, e.what());                     \
-            return nullptr;                                                   \
-        }                                                                     \
-    }                                                                         \
-    PYBIND11_EMBEDDED_MODULE_IMPL(name)                                       \
-    pybind11::detail::embedded_module name(PYBIND11_TOSTRING(name),           \
-                               PYBIND11_CONCAT(pybind11_init_impl_, name));   \
-    void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable)
-
+#define PYBIND11_EMBEDDED_MODULE(name, variable)                               \
+  static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &);       \
+  static PyObject PYBIND11_CONCAT(*pybind11_init_wrapper_, name)() {           \
+    auto m = pybind11::module(PYBIND11_TOSTRING(name));                        \
+    try {                                                                      \
+      PYBIND11_CONCAT(pybind11_init_, name)(m);                                \
+      return m.ptr();                                                          \
+    } catch (pybind11::error_already_set & e) {                                \
+      PyErr_SetString(PyExc_ImportError, e.what());                            \
+      return nullptr;                                                          \
+    } catch (const std::exception &e) {                                        \
+      PyErr_SetString(PyExc_ImportError, e.what());                            \
+      return nullptr;                                                          \
+    }                                                                          \
+  }                                                                            \
+  PYBIND11_EMBEDDED_MODULE_IMPL(name)                                          \
+  pybind11::detail::embedded_module name(                                      \
+      PYBIND11_TOSTRING(name), PYBIND11_CONCAT(pybind11_init_impl_, name));    \
+  void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module & variable)
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
-/// Python 2.7/3.x compatible version of `PyImport_AppendInittab` and error checks.
+/// Python 2.7/3.x compatible version of `PyImport_AppendInittab` and error
+/// checks.
 struct embedded_module {
 #if PY_MAJOR_VERSION >= 3
-    using init_t = PyObject *(*)();
+  using init_t = PyObject *(*)();
 #else
-    using init_t = void (*)();
+  using init_t = void (*)();
 #endif
-    embedded_module(const char *name, init_t init) {
-        if (Py_IsInitialized())
-            pybind11_fail("Can't add new modules after the interpreter has been initialized");
+  embedded_module(const char *name, init_t init) {
+    if (Py_IsInitialized())
+      pybind11_fail(
+          "Can't add new modules after the interpreter has been initialized");
 
-        auto result = PyImport_AppendInittab(name, init);
-        if (result == -1)
-            pybind11_fail("Insufficient memory to add a new module");
-    }
+    auto result = PyImport_AppendInittab(name, init);
+    if (result == -1)
+      pybind11_fail("Insufficient memory to add a new module");
+  }
 };
 
 NAMESPACE_END(detail)
 
 /** \rst
-    Initialize the Python interpreter. No other pybind11 or CPython API functions can be
-    called before this is done; with the exception of `PYBIND11_EMBEDDED_MODULE`. The
-    optional parameter can be used to skip the registration of signal handlers (see the
-    `Python documentation`_ for details). Calling this function again after the interpreter
-    has already been initialized is a fatal error.
+    Initialize the Python interpreter. No other pybind11 or CPython API
+    functions can be called before this is done; with the exception of
+    `PYBIND11_EMBEDDED_MODULE`. The optional parameter can be used to skip the
+    registration of signal handlers (see the `Python documentation`_ for
+    details). Calling this function again after the interpreter has already
+    been initialized is a fatal error.
 
-    If initializing the Python interpreter fails, then the program is terminated.  (This
-    is controlled by the CPython runtime and is an exception to pybind11's normal behavior
-    of throwing exceptions on errors.)
+    If initializing the Python interpreter fails, then the program is
+    terminated.  (This is controlled by the CPython runtime and is an exception
+    to pybind11's normal behavior of throwing exceptions on errors.)
 
-    .. _Python documentation: https://docs.python.org/3/c-api/init.html#c.Py_InitializeEx
- \endrst */
+    .. _Python documentation:
+        https://docs.python.org/3/c-api/init.html#c.Py_InitializeEx \endrst */
 inline void initialize_interpreter(bool init_signal_handlers = true) {
-    if (Py_IsInitialized())
-        pybind11_fail("The interpreter is already running");
+  if (Py_IsInitialized())
+    pybind11_fail("The interpreter is already running");
 
-    Py_InitializeEx(init_signal_handlers ? 1 : 0);
+  Py_InitializeEx(init_signal_handlers ? 1 : 0);
 
-    // Make .py files in the working directory available by default
-    module::import("sys").attr("path").cast<list>().append(".");
+  // Make .py files in the working directory available by default
+  module::import("sys").attr("path").cast<list>().append(".");
 }
 
 /** \rst
-    Shut down the Python interpreter. No pybind11 or CPython API functions can be called
-    after this. In addition, pybind11 objects must not outlive the interpreter:
+    Shut down the Python interpreter. No pybind11 or CPython API functions can
+    be called after this. In addition, pybind11 objects must not outlive the
+    interpreter:
 
     .. code-block:: cpp
 
@@ -136,32 +139,33 @@ inline void initialize_interpreter(bool init_signal_handlers = true) {
 
     .. warning::
 
-        The interpreter can be restarted by calling `initialize_interpreter` again.
-        Modules created using pybind11 can be safely re-initialized. However, Python
-        itself cannot completely unload binary extension modules and there are several
-        caveats with regard to interpreter restarting. All the details can be found
-        in the CPython documentation. In short, not all interpreter memory may be
+        The interpreter can be restarted by calling `initialize_interpreter`
+ again. Modules created using pybind11 can be safely re-initialized. However,
+ Python itself cannot completely unload binary extension modules and there are
+ several caveats with regard to interpreter restarting. All the details can be
+ found in the CPython documentation. In short, not all interpreter memory may be
         freed, either due to reference cycles or user-created global data.
 
  \endrst */
 inline void finalize_interpreter() {
-    handle builtins(PyEval_GetBuiltins());
-    const char *id = PYBIND11_INTERNALS_ID;
+  handle builtins(PyEval_GetBuiltins());
+  const char *id = PYBIND11_INTERNALS_ID;
 
-    // Get the internals pointer (without creating it if it doesn't exist).  It's possible for the
-    // internals to be created during Py_Finalize() (e.g. if a py::capsule calls `get_internals()`
-    // during destruction), so we get the pointer-pointer here and check it after Py_Finalize().
-    detail::internals **internals_ptr_ptr = detail::get_internals_pp();
-    // It could also be stashed in builtins, so look there too:
-    if (builtins.contains(id) && isinstance<capsule>(builtins[id]))
-        internals_ptr_ptr = capsule(builtins[id]);
+  // Get the internals pointer (without creating it if it doesn't exist).  It's
+  // possible for the internals to be created during Py_Finalize() (e.g. if a
+  // py::capsule calls `get_internals()` during destruction), so we get the
+  // pointer-pointer here and check it after Py_Finalize().
+  detail::internals **internals_ptr_ptr = detail::get_internals_pp();
+  // It could also be stashed in builtins, so look there too:
+  if (builtins.contains(id) && isinstance<capsule>(builtins[id]))
+    internals_ptr_ptr = capsule(builtins[id]);
 
-    Py_Finalize();
+  Py_Finalize();
 
-    if (internals_ptr_ptr) {
-        delete *internals_ptr_ptr;
-        *internals_ptr_ptr = nullptr;
-    }
+  if (internals_ptr_ptr) {
+    delete *internals_ptr_ptr;
+    *internals_ptr_ptr = nullptr;
+  }
 }
 
 /** \rst
@@ -179,22 +183,24 @@ inline void finalize_interpreter() {
  \endrst */
 class scoped_interpreter {
 public:
-    scoped_interpreter(bool init_signal_handlers = true) {
-        initialize_interpreter(init_signal_handlers);
-    }
+  scoped_interpreter(bool init_signal_handlers = true) {
+    initialize_interpreter(init_signal_handlers);
+  }
 
-    scoped_interpreter(const scoped_interpreter &) = delete;
-    scoped_interpreter(scoped_interpreter &&other) noexcept { other.is_valid = false; }
-    scoped_interpreter &operator=(const scoped_interpreter &) = delete;
-    scoped_interpreter &operator=(scoped_interpreter &&) = delete;
+  scoped_interpreter(const scoped_interpreter &) = delete;
+  scoped_interpreter(scoped_interpreter &&other) noexcept {
+    other.is_valid = false;
+  }
+  scoped_interpreter &operator=(const scoped_interpreter &) = delete;
+  scoped_interpreter &operator=(scoped_interpreter &&) = delete;
 
-    ~scoped_interpreter() {
-        if (is_valid)
-            finalize_interpreter();
-    }
+  ~scoped_interpreter() {
+    if (is_valid)
+      finalize_interpreter();
+  }
 
 private:
-    bool is_valid = true;
+  bool is_valid = true;
 };
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/eval.h b/python/src/pybind11/eval.h
index ea85ba1db..a1b5c1263 100644
--- a/python/src/pybind11/eval.h
+++ b/python/src/pybind11/eval.h
@@ -2,8 +2,8 @@
     pybind11/exec.h: Support for evaluating Python expressions and statements
     from strings and files
 
-    Copyright (c) 2016 Klemens Morgenstern <klemens.morgenstern@ed-chemnitz.de> and
-                       Wenzel Jakob <wenzel.jakob@epfl.ch>
+    Copyright (c) 2016 Klemens Morgenstern <klemens.morgenstern@ed-chemnitz.de>
+   and Wenzel Jakob <wenzel.jakob@epfl.ch>
 
     All rights reserved. Use of this source code is governed by a
     BSD-style license that can be found in the LICENSE file.
@@ -16,102 +16,119 @@
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
 enum eval_mode {
-    /// Evaluate a string containing an isolated expression
-    eval_expr,
+  /// Evaluate a string containing an isolated expression
+  eval_expr,
 
-    /// Evaluate a string containing a single statement. Returns \c none
-    eval_single_statement,
+  /// Evaluate a string containing a single statement. Returns \c none
+  eval_single_statement,
 
-    /// Evaluate a string containing a sequence of statement. Returns \c none
-    eval_statements
+  /// Evaluate a string containing a sequence of statement. Returns \c none
+  eval_statements
 };
 
 template <eval_mode mode = eval_expr>
 object eval(str expr, object global = globals(), object local = object()) {
-    if (!local)
-        local = global;
+  if (!local)
+    local = global;
 
-    /* PyRun_String does not accept a PyObject / encoding specifier,
-       this seems to be the only alternative */
-    std::string buffer = "# -*- coding: utf-8 -*-\n" + (std::string) expr;
+  /* PyRun_String does not accept a PyObject / encoding specifier,
+     this seems to be the only alternative */
+  std::string buffer = "# -*- coding: utf-8 -*-\n" + (std::string)expr;
 
-    int start;
-    switch (mode) {
-        case eval_expr:             start = Py_eval_input;   break;
-        case eval_single_statement: start = Py_single_input; break;
-        case eval_statements:       start = Py_file_input;   break;
-        default: pybind11_fail("invalid evaluation mode");
-    }
+  int start;
+  switch (mode) {
+  case eval_expr:
+    start = Py_eval_input;
+    break;
+  case eval_single_statement:
+    start = Py_single_input;
+    break;
+  case eval_statements:
+    start = Py_file_input;
+    break;
+  default:
+    pybind11_fail("invalid evaluation mode");
+  }
 
-    PyObject *result = PyRun_String(buffer.c_str(), start, global.ptr(), local.ptr());
-    if (!result)
-        throw error_already_set();
-    return reinterpret_steal<object>(result);
+  PyObject *result =
+      PyRun_String(buffer.c_str(), start, global.ptr(), local.ptr());
+  if (!result)
+    throw error_already_set();
+  return reinterpret_steal<object>(result);
 }
 
 template <eval_mode mode = eval_expr, size_t N>
-object eval(const char (&s)[N], object global = globals(), object local = object()) {
-    /* Support raw string literals by removing common leading whitespace */
-    auto expr = (s[0] == '\n') ? str(module::import("textwrap").attr("dedent")(s))
-                               : str(s);
-    return eval<mode>(expr, global, local);
+object eval(const char (&s)[N], object global = globals(),
+            object local = object()) {
+  /* Support raw string literals by removing common leading whitespace */
+  auto expr = (s[0] == '\n') ? str(module::import("textwrap").attr("dedent")(s))
+                             : str(s);
+  return eval<mode>(expr, global, local);
 }
 
 inline void exec(str expr, object global = globals(), object local = object()) {
-    eval<eval_statements>(expr, global, local);
+  eval<eval_statements>(expr, global, local);
 }
 
 template <size_t N>
-void exec(const char (&s)[N], object global = globals(), object local = object()) {
-    eval<eval_statements>(s, global, local);
+void exec(const char (&s)[N], object global = globals(),
+          object local = object()) {
+  eval<eval_statements>(s, global, local);
 }
 
 template <eval_mode mode = eval_statements>
-object eval_file(str fname, object global = globals(), object local = object()) {
-    if (!local)
-        local = global;
+object eval_file(str fname, object global = globals(),
+                 object local = object()) {
+  if (!local)
+    local = global;
 
-    int start;
-    switch (mode) {
-        case eval_expr:             start = Py_eval_input;   break;
-        case eval_single_statement: start = Py_single_input; break;
-        case eval_statements:       start = Py_file_input;   break;
-        default: pybind11_fail("invalid evaluation mode");
-    }
+  int start;
+  switch (mode) {
+  case eval_expr:
+    start = Py_eval_input;
+    break;
+  case eval_single_statement:
+    start = Py_single_input;
+    break;
+  case eval_statements:
+    start = Py_file_input;
+    break;
+  default:
+    pybind11_fail("invalid evaluation mode");
+  }
 
-    int closeFile = 1;
-    std::string fname_str = (std::string) fname;
+  int closeFile = 1;
+  std::string fname_str = (std::string)fname;
 #if PY_VERSION_HEX >= 0x03040000
-    FILE *f = _Py_fopen_obj(fname.ptr(), "r");
+  FILE *f = _Py_fopen_obj(fname.ptr(), "r");
 #elif PY_VERSION_HEX >= 0x03000000
-    FILE *f = _Py_fopen(fname.ptr(), "r");
+  FILE *f = _Py_fopen(fname.ptr(), "r");
 #else
-    /* No unicode support in open() :( */
-    auto fobj = reinterpret_steal<object>(PyFile_FromString(
-        const_cast<char *>(fname_str.c_str()),
-        const_cast<char*>("r")));
-    FILE *f = nullptr;
-    if (fobj)
-        f = PyFile_AsFile(fobj.ptr());
-    closeFile = 0;
+  /* No unicode support in open() :( */
+  auto fobj = reinterpret_steal<object>(PyFile_FromString(
+      const_cast<char *>(fname_str.c_str()), const_cast<char *>("r")));
+  FILE *f = nullptr;
+  if (fobj)
+    f = PyFile_AsFile(fobj.ptr());
+  closeFile = 0;
 #endif
-    if (!f) {
-        PyErr_Clear();
-        pybind11_fail("File \"" + fname_str + "\" could not be opened!");
-    }
+  if (!f) {
+    PyErr_Clear();
+    pybind11_fail("File \"" + fname_str + "\" could not be opened!");
+  }
 
 #if PY_VERSION_HEX < 0x03000000 && defined(PYPY_VERSION)
-    PyObject *result = PyRun_File(f, fname_str.c_str(), start, global.ptr(),
-                                  local.ptr());
-    (void) closeFile;
+  PyObject *result =
+      PyRun_File(f, fname_str.c_str(), start, global.ptr(), local.ptr());
+  (void)closeFile;
 #else
-    PyObject *result = PyRun_FileEx(f, fname_str.c_str(), start, global.ptr(),
-                                    local.ptr(), closeFile);
+  PyObject *result = PyRun_FileEx(f, fname_str.c_str(), start, global.ptr(),
+                                  local.ptr(), closeFile);
 #endif
 
-    if (!result)
-        throw error_already_set();
-    return reinterpret_steal<object>(result);
+  if (!result)
+    throw error_already_set();
+  return reinterpret_steal<object>(result);
 }
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/functional.h b/python/src/pybind11/functional.h
index 00457e965..d97585805 100644
--- a/python/src/pybind11/functional.h
+++ b/python/src/pybind11/functional.h
@@ -17,91 +17,99 @@ NAMESPACE_BEGIN(detail)
 
 template <typename Return, typename... Args>
 struct type_caster<std::function<Return(Args...)>> {
-    using type = std::function<Return(Args...)>;
-    using retval_type = conditional_t<std::is_same<Return, void>::value, void_type, Return>;
-    using function_type = Return (*) (Args...);
+  using type = std::function<Return(Args...)>;
+  using retval_type =
+      conditional_t<std::is_same<Return, void>::value, void_type, Return>;
+  using function_type = Return (*)(Args...);
 
 public:
-    bool load(handle src, bool convert) {
-        if (src.is_none()) {
-            // Defer accepting None to other overloads (if we aren't in convert mode):
-            if (!convert) return false;
-            return true;
-        }
+  bool load(handle src, bool convert) {
+    if (src.is_none()) {
+      // Defer accepting None to other overloads (if we aren't in convert mode):
+      if (!convert)
+        return false;
+      return true;
+    }
 
-        if (!isinstance<function>(src))
-            return false;
+    if (!isinstance<function>(src))
+      return false;
 
-        auto func = reinterpret_borrow<function>(src);
+    auto func = reinterpret_borrow<function>(src);
 
-        /*
-           When passing a C++ function as an argument to another C++
-           function via Python, every function call would normally involve
-           a full C++ -> Python -> C++ roundtrip, which can be prohibitive.
-           Here, we try to at least detect the case where the function is
-           stateless (i.e. function pointer or lambda function without
-           captured variables), in which case the roundtrip can be avoided.
-         */
-        if (auto cfunc = func.cpp_function()) {
-            auto c = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(cfunc.ptr()));
-            auto rec = (function_record *) c;
+    /*
+       When passing a C++ function as an argument to another C++
+       function via Python, every function call would normally involve
+       a full C++ -> Python -> C++ roundtrip, which can be prohibitive.
+       Here, we try to at least detect the case where the function is
+       stateless (i.e. function pointer or lambda function without
+       captured variables), in which case the roundtrip can be avoided.
+     */
+    if (auto cfunc = func.cpp_function()) {
+      auto c = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(cfunc.ptr()));
+      auto rec = (function_record *)c;
 
-            if (rec && rec->is_stateless &&
-                    same_type(typeid(function_type), *reinterpret_cast<const std::type_info *>(rec->data[1]))) {
-                struct capture { function_type f; };
-                value = ((capture *) &rec->data)->f;
-                return true;
-            }
-        }
-
-        // ensure GIL is held during functor destruction
-        struct func_handle {
-            function f;
-            func_handle(function&& f_) : f(std::move(f_)) {}
-            func_handle(const func_handle&) = default;
-            ~func_handle() {
-                gil_scoped_acquire acq;
-                function kill_f(std::move(f));
-            }
+      if (rec && rec->is_stateless &&
+          same_type(typeid(function_type),
+                    *reinterpret_cast<const std::type_info *>(rec->data[1]))) {
+        struct capture {
+          function_type f;
         };
-
-        // value = [hfunc = func_handle(std::move(func))](Args... args) -> Return {
-        //     gil_scoped_acquire acq;
-        //     object retval(hfunc.f(std::forward<Args>(args)...));
-        //     /* Visual studio 2015 parser issue: need parentheses around this expression */
-        //     return (retval.template cast<Return>());
-        // };
-
-        struct func_wrapper {
-            func_handle hfunc;
-            func_wrapper(func_handle&& hf): hfunc(std::move(hf)) {}
-            Return operator()(Args... args) const {
-                gil_scoped_acquire acq;
-                object retval(hfunc.f(std::forward<Args>(args)...));
-                /* Visual studio 2015 parser issue: need parentheses around this expression */
-                return (retval.template cast<Return>());
-            }
-        };
-
-        value = func_wrapper(func_handle(std::move(func)));
-
+        value = ((capture *)&rec->data)->f;
         return true;
+      }
     }
 
-    template <typename Func>
-    static handle cast(Func &&f_, return_value_policy policy, handle /* parent */) {
-        if (!f_)
-            return none().inc_ref();
+    // ensure GIL is held during functor destruction
+    struct func_handle {
+      function f;
+      func_handle(function &&f_) : f(std::move(f_)) {}
+      func_handle(const func_handle &) = default;
+      ~func_handle() {
+        gil_scoped_acquire acq;
+        function kill_f(std::move(f));
+      }
+    };
 
-        auto result = f_.template target<function_type>();
-        if (result)
-            return cpp_function(*result, policy).release();
-        else
-            return cpp_function(std::forward<Func>(f_), policy).release();
-    }
+    // value = [hfunc = func_handle(std::move(func))](Args... args) -> Return {
+    //     gil_scoped_acquire acq;
+    //     object retval(hfunc.f(std::forward<Args>(args)...));
+    //     /* Visual studio 2015 parser issue: need parentheses around this
+    //     expression */ return (retval.template cast<Return>());
+    // };
 
-    PYBIND11_TYPE_CASTER(type, _("Callable[[") + concat(make_caster<Args>::name...) + _("], ")
-                               + make_caster<retval_type>::name + _("]"));
+    struct func_wrapper {
+      func_handle hfunc;
+      func_wrapper(func_handle &&hf) : hfunc(std::move(hf)) {}
+      Return operator()(Args... args) const {
+        gil_scoped_acquire acq;
+        object retval(hfunc.f(std::forward<Args>(args)...));
+        /* Visual studio 2015 parser issue: need parentheses around this
+         * expression */
+        return (retval.template cast<Return>());
+      }
+    };
+
+    value = func_wrapper(func_handle(std::move(func)));
+
+    return true;
+  }
+
+  template <typename Func>
+  static handle cast(Func &&f_, return_value_policy policy,
+                     handle /* parent */) {
+    if (!f_)
+      return none().inc_ref();
+
+    auto result = f_.template target<function_type>();
+    if (result)
+      return cpp_function(*result, policy).release();
+    else
+      return cpp_function(std::forward<Func>(f_), policy).release();
+  }
+
+  PYBIND11_TYPE_CASTER(type, _("Callable[[") +
+                                 concat(make_caster<Args>::name...) + _("], ") +
+                                 make_caster<retval_type>::name + _("]"));
 };
 
 NAMESPACE_END(detail)
diff --git a/python/src/pybind11/iostream.h b/python/src/pybind11/iostream.h
index 72baef8fd..7e6a6ae52 100644
--- a/python/src/pybind11/iostream.h
+++ b/python/src/pybind11/iostream.h
@@ -1,5 +1,6 @@
 /*
-    pybind11/iostream.h -- Tools to assist with redirecting cout and cerr to Python
+    pybind11/iostream.h -- Tools to assist with redirecting cout and cerr to
+   Python
 
     Copyright (c) 2017 Henry F. Schreiner
 
@@ -11,11 +12,11 @@
 
 #include "pybind11.h"
 
-#include <streambuf>
-#include <ostream>
-#include <string>
-#include <memory>
 #include <iostream>
+#include <memory>
+#include <ostream>
+#include <streambuf>
+#include <string>
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
@@ -23,56 +24,50 @@ NAMESPACE_BEGIN(detail)
 // Buffer that writes to Python instead of C++
 class pythonbuf : public std::streambuf {
 private:
-    using traits_type = std::streambuf::traits_type;
+  using traits_type = std::streambuf::traits_type;
 
-    const size_t buf_size;
-    std::unique_ptr<char[]> d_buffer;
-    object pywrite;
-    object pyflush;
+  const size_t buf_size;
+  std::unique_ptr<char[]> d_buffer;
+  object pywrite;
+  object pyflush;
 
-    int overflow(int c) {
-        if (!traits_type::eq_int_type(c, traits_type::eof())) {
-            *pptr() = traits_type::to_char_type(c);
-            pbump(1);
-        }
-        return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof();
+  int overflow(int c) {
+    if (!traits_type::eq_int_type(c, traits_type::eof())) {
+      *pptr() = traits_type::to_char_type(c);
+      pbump(1);
     }
+    return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof();
+  }
 
-    int sync() {
-        if (pbase() != pptr()) {
-            // This subtraction cannot be negative, so dropping the sign
-            str line(pbase(), static_cast<size_t>(pptr() - pbase()));
+  int sync() {
+    if (pbase() != pptr()) {
+      // This subtraction cannot be negative, so dropping the sign
+      str line(pbase(), static_cast<size_t>(pptr() - pbase()));
 
-            {
-                gil_scoped_acquire tmp;
-                pywrite(line);
-                pyflush();
-            }
+      {
+        gil_scoped_acquire tmp;
+        pywrite(line);
+        pyflush();
+      }
 
-            setp(pbase(), epptr());
-        }
-        return 0;
+      setp(pbase(), epptr());
     }
+    return 0;
+  }
 
 public:
+  pythonbuf(object pyostream, size_t buffer_size = 1024)
+      : buf_size(buffer_size), d_buffer(new char[buf_size]),
+        pywrite(pyostream.attr("write")), pyflush(pyostream.attr("flush")) {
+    setp(d_buffer.get(), d_buffer.get() + buf_size - 1);
+  }
 
-    pythonbuf(object pyostream, size_t buffer_size = 1024)
-        : buf_size(buffer_size),
-          d_buffer(new char[buf_size]),
-          pywrite(pyostream.attr("write")),
-          pyflush(pyostream.attr("flush")) {
-        setp(d_buffer.get(), d_buffer.get() + buf_size - 1);
-    }
-
-    /// Sync before destroy
-    ~pythonbuf() {
-        sync();
-    }
+  /// Sync before destroy
+  ~pythonbuf() { sync(); }
 };
 
 NAMESPACE_END(detail)
 
-
 /** \rst
     This a move-only guard that redirects output.
 
@@ -93,35 +88,32 @@ NAMESPACE_END(detail)
     .. code-block:: cpp
 
         {
-            py::scoped_ostream_redirect output{std::cerr, py::module::import("sys").attr("stderr")};
-            std::cerr << "Hello, World!";
+            py::scoped_ostream_redirect output{std::cerr,
+ py::module::import("sys").attr("stderr")}; std::cerr << "Hello, World!";
         }
  \endrst */
 class scoped_ostream_redirect {
 protected:
-    std::streambuf *old;
-    std::ostream &costream;
-    detail::pythonbuf buffer;
+  std::streambuf *old;
+  std::ostream &costream;
+  detail::pythonbuf buffer;
 
 public:
-    scoped_ostream_redirect(
-            std::ostream &costream = std::cout,
-            object pyostream = module::import("sys").attr("stdout"))
-        : costream(costream), buffer(pyostream) {
-        old = costream.rdbuf(&buffer);
-    }
+  scoped_ostream_redirect(
+      std::ostream &costream = std::cout,
+      object pyostream = module::import("sys").attr("stdout"))
+      : costream(costream), buffer(pyostream) {
+    old = costream.rdbuf(&buffer);
+  }
 
-    ~scoped_ostream_redirect() {
-        costream.rdbuf(old);
-    }
+  ~scoped_ostream_redirect() { costream.rdbuf(old); }
 
-    scoped_ostream_redirect(const scoped_ostream_redirect &) = delete;
-    scoped_ostream_redirect(scoped_ostream_redirect &&other) = default;
-    scoped_ostream_redirect &operator=(const scoped_ostream_redirect &) = delete;
-    scoped_ostream_redirect &operator=(scoped_ostream_redirect &&) = delete;
+  scoped_ostream_redirect(const scoped_ostream_redirect &) = delete;
+  scoped_ostream_redirect(scoped_ostream_redirect &&other) = default;
+  scoped_ostream_redirect &operator=(const scoped_ostream_redirect &) = delete;
+  scoped_ostream_redirect &operator=(scoped_ostream_redirect &&) = delete;
 };
 
-
 /** \rst
     Like `scoped_ostream_redirect`, but redirects cerr by default. This class
     is provided primary to make ``py::call_guard`` easier to make.
@@ -135,44 +127,44 @@ public:
 \endrst */
 class scoped_estream_redirect : public scoped_ostream_redirect {
 public:
-    scoped_estream_redirect(
-            std::ostream &costream = std::cerr,
-            object pyostream = module::import("sys").attr("stderr"))
-        : scoped_ostream_redirect(costream,pyostream) {}
+  scoped_estream_redirect(
+      std::ostream &costream = std::cerr,
+      object pyostream = module::import("sys").attr("stderr"))
+      : scoped_ostream_redirect(costream, pyostream) {}
 };
 
-
 NAMESPACE_BEGIN(detail)
 
 // Class to redirect output as a context manager. C++ backend.
 class OstreamRedirect {
-    bool do_stdout_;
-    bool do_stderr_;
-    std::unique_ptr<scoped_ostream_redirect> redirect_stdout;
-    std::unique_ptr<scoped_estream_redirect> redirect_stderr;
+  bool do_stdout_;
+  bool do_stderr_;
+  std::unique_ptr<scoped_ostream_redirect> redirect_stdout;
+  std::unique_ptr<scoped_estream_redirect> redirect_stderr;
 
 public:
-    OstreamRedirect(bool do_stdout = true, bool do_stderr = true)
-        : do_stdout_(do_stdout), do_stderr_(do_stderr) {}
+  OstreamRedirect(bool do_stdout = true, bool do_stderr = true)
+      : do_stdout_(do_stdout), do_stderr_(do_stderr) {}
 
-    void enter() {
-        if (do_stdout_)
-            redirect_stdout.reset(new scoped_ostream_redirect());
-        if (do_stderr_)
-            redirect_stderr.reset(new scoped_estream_redirect());
-    }
+  void enter() {
+    if (do_stdout_)
+      redirect_stdout.reset(new scoped_ostream_redirect());
+    if (do_stderr_)
+      redirect_stderr.reset(new scoped_estream_redirect());
+  }
 
-    void exit() {
-        redirect_stdout.reset();
-        redirect_stderr.reset();
-    }
+  void exit() {
+    redirect_stdout.reset();
+    redirect_stderr.reset();
+  }
 };
 
 NAMESPACE_END(detail)
 
 /** \rst
     This is a helper function to add a C++ redirect context manager to Python
-    instead of using a C++ guard. To use it, add the following to your binding code:
+    instead of using a C++ guard. To use it, add the following to your binding
+ code:
 
     .. code-block:: cpp
 
@@ -197,11 +189,13 @@ NAMESPACE_END(detail)
             m.noisy_function_with_error_printing()
 
  \endrst */
-inline class_<detail::OstreamRedirect> add_ostream_redirect(module m, std::string name = "ostream_redirect") {
-    return class_<detail::OstreamRedirect>(m, name.c_str(), module_local())
-        .def(init<bool,bool>(), arg("stdout")=true, arg("stderr")=true)
-        .def("__enter__", &detail::OstreamRedirect::enter)
-        .def("__exit__", [](detail::OstreamRedirect &self_, args) { self_.exit(); });
+inline class_<detail::OstreamRedirect>
+add_ostream_redirect(module m, std::string name = "ostream_redirect") {
+  return class_<detail::OstreamRedirect>(m, name.c_str(), module_local())
+      .def(init<bool, bool>(), arg("stdout") = true, arg("stderr") = true)
+      .def("__enter__", &detail::OstreamRedirect::enter)
+      .def("__exit__",
+           [](detail::OstreamRedirect &self_, args) { self_.exit(); });
 }
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/numpy.h b/python/src/pybind11/numpy.h
index b2a02e024..1993d5479 100644
--- a/python/src/pybind11/numpy.h
+++ b/python/src/pybind11/numpy.h
@@ -9,29 +9,30 @@
 
 #pragma once
 
-#include "pybind11.h"
 #include "complex.h"
-#include <numeric>
+#include "pybind11.h"
 #include <algorithm>
 #include <array>
 #include <cstdlib>
 #include <cstring>
+#include <functional>
+#include <numeric>
 #include <sstream>
 #include <string>
-#include <functional>
+#include <typeindex>
 #include <utility>
 #include <vector>
-#include <typeindex>
 
 #if defined(_MSC_VER)
-#  pragma warning(push)
-#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#pragma warning(push)
+#pragma warning(                                                               \
+    disable : 4127) // warning C4127: Conditional expression is constant
 #endif
 
-/* This will be true on all flat address space platforms and allows us to reduce the
-   whole npy_intp / ssize_t / Py_intptr_t business down to just ssize_t for all size
-   and dimension types (e.g. shape, strides, indexing), instead of inflicting this
-   upon the library user. */
+/* This will be true on all flat address space platforms and allows us to reduce
+   the whole npy_intp / ssize_t / Py_intptr_t business down to just ssize_t for
+   all size and dimension types (e.g. shape, strides, indexing), instead of
+   inflicting this upon the library user. */
 static_assert(sizeof(ssize_t) == sizeof(Py_intptr_t), "ssize_t != Py_intptr_t");
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
@@ -42,1535 +43,1707 @@ NAMESPACE_BEGIN(detail)
 template <typename type, typename SFINAE = void> struct npy_format_descriptor;
 
 struct PyArrayDescr_Proxy {
-    PyObject_HEAD
-    PyObject *typeobj;
-    char kind;
-    char type;
-    char byteorder;
-    char flags;
-    int type_num;
-    int elsize;
-    int alignment;
-    char *subarray;
-    PyObject *fields;
-    PyObject *names;
+  PyObject_HEAD PyObject *typeobj;
+  char kind;
+  char type;
+  char byteorder;
+  char flags;
+  int type_num;
+  int elsize;
+  int alignment;
+  char *subarray;
+  PyObject *fields;
+  PyObject *names;
 };
 
 struct PyArray_Proxy {
-    PyObject_HEAD
-    char *data;
-    int nd;
-    ssize_t *dimensions;
-    ssize_t *strides;
-    PyObject *base;
-    PyObject *descr;
-    int flags;
+  PyObject_HEAD char *data;
+  int nd;
+  ssize_t *dimensions;
+  ssize_t *strides;
+  PyObject *base;
+  PyObject *descr;
+  int flags;
 };
 
 struct PyVoidScalarObject_Proxy {
-    PyObject_VAR_HEAD
-    char *obval;
-    PyArrayDescr_Proxy *descr;
-    int flags;
-    PyObject *base;
+  PyObject_VAR_HEAD char *obval;
+  PyArrayDescr_Proxy *descr;
+  int flags;
+  PyObject *base;
 };
 
 struct numpy_type_info {
-    PyObject* dtype_ptr;
-    std::string format_str;
+  PyObject *dtype_ptr;
+  std::string format_str;
 };
 
 struct numpy_internals {
-    std::unordered_map<std::type_index, numpy_type_info> registered_dtypes;
+  std::unordered_map<std::type_index, numpy_type_info> registered_dtypes;
 
-    numpy_type_info *get_type_info(const std::type_info& tinfo, bool throw_if_missing = true) {
-        auto it = registered_dtypes.find(std::type_index(tinfo));
-        if (it != registered_dtypes.end())
-            return &(it->second);
-        if (throw_if_missing)
-            pybind11_fail(std::string("NumPy type info missing for ") + tinfo.name());
-        return nullptr;
-    }
+  numpy_type_info *get_type_info(const std::type_info &tinfo,
+                                 bool throw_if_missing = true) {
+    auto it = registered_dtypes.find(std::type_index(tinfo));
+    if (it != registered_dtypes.end())
+      return &(it->second);
+    if (throw_if_missing)
+      pybind11_fail(std::string("NumPy type info missing for ") + tinfo.name());
+    return nullptr;
+  }
 
-    template<typename T> numpy_type_info *get_type_info(bool throw_if_missing = true) {
-        return get_type_info(typeid(typename std::remove_cv<T>::type), throw_if_missing);
-    }
+  template <typename T>
+  numpy_type_info *get_type_info(bool throw_if_missing = true) {
+    return get_type_info(typeid(typename std::remove_cv<T>::type),
+                         throw_if_missing);
+  }
 };
 
-inline PYBIND11_NOINLINE void load_numpy_internals(numpy_internals* &ptr) {
-    ptr = &get_or_create_shared_data<numpy_internals>("_numpy_internals");
+inline PYBIND11_NOINLINE void load_numpy_internals(numpy_internals *&ptr) {
+  ptr = &get_or_create_shared_data<numpy_internals>("_numpy_internals");
 }
 
-inline numpy_internals& get_numpy_internals() {
-    static numpy_internals* ptr = nullptr;
-    if (!ptr)
-        load_numpy_internals(ptr);
-    return *ptr;
+inline numpy_internals &get_numpy_internals() {
+  static numpy_internals *ptr = nullptr;
+  if (!ptr)
+    load_numpy_internals(ptr);
+  return *ptr;
 }
 
 struct npy_api {
-    enum constants {
-        NPY_ARRAY_C_CONTIGUOUS_ = 0x0001,
-        NPY_ARRAY_F_CONTIGUOUS_ = 0x0002,
-        NPY_ARRAY_OWNDATA_ = 0x0004,
-        NPY_ARRAY_FORCECAST_ = 0x0010,
-        NPY_ARRAY_ENSUREARRAY_ = 0x0040,
-        NPY_ARRAY_ALIGNED_ = 0x0100,
-        NPY_ARRAY_WRITEABLE_ = 0x0400,
-        NPY_BOOL_ = 0,
-        NPY_BYTE_, NPY_UBYTE_,
-        NPY_SHORT_, NPY_USHORT_,
-        NPY_INT_, NPY_UINT_,
-        NPY_LONG_, NPY_ULONG_,
-        NPY_LONGLONG_, NPY_ULONGLONG_,
-        NPY_FLOAT_, NPY_DOUBLE_, NPY_LONGDOUBLE_,
-        NPY_CFLOAT_, NPY_CDOUBLE_, NPY_CLONGDOUBLE_,
-        NPY_OBJECT_ = 17,
-        NPY_STRING_, NPY_UNICODE_, NPY_VOID_
-    };
+  enum constants {
+    NPY_ARRAY_C_CONTIGUOUS_ = 0x0001,
+    NPY_ARRAY_F_CONTIGUOUS_ = 0x0002,
+    NPY_ARRAY_OWNDATA_ = 0x0004,
+    NPY_ARRAY_FORCECAST_ = 0x0010,
+    NPY_ARRAY_ENSUREARRAY_ = 0x0040,
+    NPY_ARRAY_ALIGNED_ = 0x0100,
+    NPY_ARRAY_WRITEABLE_ = 0x0400,
+    NPY_BOOL_ = 0,
+    NPY_BYTE_,
+    NPY_UBYTE_,
+    NPY_SHORT_,
+    NPY_USHORT_,
+    NPY_INT_,
+    NPY_UINT_,
+    NPY_LONG_,
+    NPY_ULONG_,
+    NPY_LONGLONG_,
+    NPY_ULONGLONG_,
+    NPY_FLOAT_,
+    NPY_DOUBLE_,
+    NPY_LONGDOUBLE_,
+    NPY_CFLOAT_,
+    NPY_CDOUBLE_,
+    NPY_CLONGDOUBLE_,
+    NPY_OBJECT_ = 17,
+    NPY_STRING_,
+    NPY_UNICODE_,
+    NPY_VOID_
+  };
 
-    typedef struct {
-        Py_intptr_t *ptr;
-        int len;
-    } PyArray_Dims;
+  typedef struct {
+    Py_intptr_t *ptr;
+    int len;
+  } PyArray_Dims;
 
-    static npy_api& get() {
-        static npy_api api = lookup();
-        return api;
-    }
+  static npy_api &get() {
+    static npy_api api = lookup();
+    return api;
+  }
 
-    bool PyArray_Check_(PyObject *obj) const {
-        return (bool) PyObject_TypeCheck(obj, PyArray_Type_);
-    }
-    bool PyArrayDescr_Check_(PyObject *obj) const {
-        return (bool) PyObject_TypeCheck(obj, PyArrayDescr_Type_);
-    }
+  bool PyArray_Check_(PyObject *obj) const {
+    return (bool)PyObject_TypeCheck(obj, PyArray_Type_);
+  }
+  bool PyArrayDescr_Check_(PyObject *obj) const {
+    return (bool)PyObject_TypeCheck(obj, PyArrayDescr_Type_);
+  }
+
+  unsigned int (*PyArray_GetNDArrayCFeatureVersion_)();
+  PyObject *(*PyArray_DescrFromType_)(int);
+  PyObject *(*PyArray_NewFromDescr_)(PyTypeObject *, PyObject *, int,
+                                     Py_intptr_t *, Py_intptr_t *, void *, int,
+                                     PyObject *);
+  PyObject *(*PyArray_DescrNewFromType_)(int);
+  int (*PyArray_CopyInto_)(PyObject *, PyObject *);
+  PyObject *(*PyArray_NewCopy_)(PyObject *, int);
+  PyTypeObject *PyArray_Type_;
+  PyTypeObject *PyVoidArrType_Type_;
+  PyTypeObject *PyArrayDescr_Type_;
+  PyObject *(*PyArray_DescrFromScalar_)(PyObject *);
+  PyObject *(*PyArray_FromAny_)(PyObject *, PyObject *, int, int, int,
+                                PyObject *);
+  int (*PyArray_DescrConverter_)(PyObject *, PyObject **);
+  bool (*PyArray_EquivTypes_)(PyObject *, PyObject *);
+  int (*PyArray_GetArrayParamsFromObject_)(PyObject *, PyObject *, char,
+                                           PyObject **, int *, Py_ssize_t *,
+                                           PyObject **, PyObject *);
+  PyObject *(*PyArray_Squeeze_)(PyObject *);
+  int (*PyArray_SetBaseObject_)(PyObject *, PyObject *);
+  PyObject *(*PyArray_Resize_)(PyObject *, PyArray_Dims *, int, int);
 
-    unsigned int (*PyArray_GetNDArrayCFeatureVersion_)();
-    PyObject *(*PyArray_DescrFromType_)(int);
-    PyObject *(*PyArray_NewFromDescr_)
-        (PyTypeObject *, PyObject *, int, Py_intptr_t *,
-         Py_intptr_t *, void *, int, PyObject *);
-    PyObject *(*PyArray_DescrNewFromType_)(int);
-    int (*PyArray_CopyInto_)(PyObject *, PyObject *);
-    PyObject *(*PyArray_NewCopy_)(PyObject *, int);
-    PyTypeObject *PyArray_Type_;
-    PyTypeObject *PyVoidArrType_Type_;
-    PyTypeObject *PyArrayDescr_Type_;
-    PyObject *(*PyArray_DescrFromScalar_)(PyObject *);
-    PyObject *(*PyArray_FromAny_) (PyObject *, PyObject *, int, int, int, PyObject *);
-    int (*PyArray_DescrConverter_) (PyObject *, PyObject **);
-    bool (*PyArray_EquivTypes_) (PyObject *, PyObject *);
-    int (*PyArray_GetArrayParamsFromObject_)(PyObject *, PyObject *, char, PyObject **, int *,
-                                             Py_ssize_t *, PyObject **, PyObject *);
-    PyObject *(*PyArray_Squeeze_)(PyObject *);
-    int (*PyArray_SetBaseObject_)(PyObject *, PyObject *);
-    PyObject* (*PyArray_Resize_)(PyObject*, PyArray_Dims*, int, int);
 private:
-    enum functions {
-        API_PyArray_GetNDArrayCFeatureVersion = 211,
-        API_PyArray_Type = 2,
-        API_PyArrayDescr_Type = 3,
-        API_PyVoidArrType_Type = 39,
-        API_PyArray_DescrFromType = 45,
-        API_PyArray_DescrFromScalar = 57,
-        API_PyArray_FromAny = 69,
-        API_PyArray_Resize = 80,
-        API_PyArray_CopyInto = 82,
-        API_PyArray_NewCopy = 85,
-        API_PyArray_NewFromDescr = 94,
-        API_PyArray_DescrNewFromType = 9,
-        API_PyArray_DescrConverter = 174,
-        API_PyArray_EquivTypes = 182,
-        API_PyArray_GetArrayParamsFromObject = 278,
-        API_PyArray_Squeeze = 136,
-        API_PyArray_SetBaseObject = 282
-    };
+  enum functions {
+    API_PyArray_GetNDArrayCFeatureVersion = 211,
+    API_PyArray_Type = 2,
+    API_PyArrayDescr_Type = 3,
+    API_PyVoidArrType_Type = 39,
+    API_PyArray_DescrFromType = 45,
+    API_PyArray_DescrFromScalar = 57,
+    API_PyArray_FromAny = 69,
+    API_PyArray_Resize = 80,
+    API_PyArray_CopyInto = 82,
+    API_PyArray_NewCopy = 85,
+    API_PyArray_NewFromDescr = 94,
+    API_PyArray_DescrNewFromType = 9,
+    API_PyArray_DescrConverter = 174,
+    API_PyArray_EquivTypes = 182,
+    API_PyArray_GetArrayParamsFromObject = 278,
+    API_PyArray_Squeeze = 136,
+    API_PyArray_SetBaseObject = 282
+  };
 
-    static npy_api lookup() {
-        module m = module::import("numpy.core.multiarray");
-        auto c = m.attr("_ARRAY_API");
+  static npy_api lookup() {
+    module m = module::import("numpy.core.multiarray");
+    auto c = m.attr("_ARRAY_API");
 #if PY_MAJOR_VERSION >= 3
-        void **api_ptr = (void **) PyCapsule_GetPointer(c.ptr(), NULL);
+    void **api_ptr = (void **)PyCapsule_GetPointer(c.ptr(), NULL);
 #else
-        void **api_ptr = (void **) PyCObject_AsVoidPtr(c.ptr());
+    void **api_ptr = (void **)PyCObject_AsVoidPtr(c.ptr());
 #endif
-        npy_api api;
-#define DECL_NPY_API(Func) api.Func##_ = (decltype(api.Func##_)) api_ptr[API_##Func];
-        DECL_NPY_API(PyArray_GetNDArrayCFeatureVersion);
-        if (api.PyArray_GetNDArrayCFeatureVersion_() < 0x7)
-            pybind11_fail("pybind11 numpy support requires numpy >= 1.7.0");
-        DECL_NPY_API(PyArray_Type);
-        DECL_NPY_API(PyVoidArrType_Type);
-        DECL_NPY_API(PyArrayDescr_Type);
-        DECL_NPY_API(PyArray_DescrFromType);
-        DECL_NPY_API(PyArray_DescrFromScalar);
-        DECL_NPY_API(PyArray_FromAny);
-        DECL_NPY_API(PyArray_Resize);
-        DECL_NPY_API(PyArray_CopyInto);
-        DECL_NPY_API(PyArray_NewCopy);
-        DECL_NPY_API(PyArray_NewFromDescr);
-        DECL_NPY_API(PyArray_DescrNewFromType);
-        DECL_NPY_API(PyArray_DescrConverter);
-        DECL_NPY_API(PyArray_EquivTypes);
-        DECL_NPY_API(PyArray_GetArrayParamsFromObject);
-        DECL_NPY_API(PyArray_Squeeze);
-        DECL_NPY_API(PyArray_SetBaseObject);
+    npy_api api;
+#define DECL_NPY_API(Func)                                                     \
+  api.Func##_ = (decltype(api.Func##_))api_ptr[API_##Func];
+    DECL_NPY_API(PyArray_GetNDArrayCFeatureVersion);
+    if (api.PyArray_GetNDArrayCFeatureVersion_() < 0x7)
+      pybind11_fail("pybind11 numpy support requires numpy >= 1.7.0");
+    DECL_NPY_API(PyArray_Type);
+    DECL_NPY_API(PyVoidArrType_Type);
+    DECL_NPY_API(PyArrayDescr_Type);
+    DECL_NPY_API(PyArray_DescrFromType);
+    DECL_NPY_API(PyArray_DescrFromScalar);
+    DECL_NPY_API(PyArray_FromAny);
+    DECL_NPY_API(PyArray_Resize);
+    DECL_NPY_API(PyArray_CopyInto);
+    DECL_NPY_API(PyArray_NewCopy);
+    DECL_NPY_API(PyArray_NewFromDescr);
+    DECL_NPY_API(PyArray_DescrNewFromType);
+    DECL_NPY_API(PyArray_DescrConverter);
+    DECL_NPY_API(PyArray_EquivTypes);
+    DECL_NPY_API(PyArray_GetArrayParamsFromObject);
+    DECL_NPY_API(PyArray_Squeeze);
+    DECL_NPY_API(PyArray_SetBaseObject);
 #undef DECL_NPY_API
-        return api;
-    }
+    return api;
+  }
 };
 
-inline PyArray_Proxy* array_proxy(void* ptr) {
-    return reinterpret_cast<PyArray_Proxy*>(ptr);
+inline PyArray_Proxy *array_proxy(void *ptr) {
+  return reinterpret_cast<PyArray_Proxy *>(ptr);
 }
 
-inline const PyArray_Proxy* array_proxy(const void* ptr) {
-    return reinterpret_cast<const PyArray_Proxy*>(ptr);
+inline const PyArray_Proxy *array_proxy(const void *ptr) {
+  return reinterpret_cast<const PyArray_Proxy *>(ptr);
 }
 
-inline PyArrayDescr_Proxy* array_descriptor_proxy(PyObject* ptr) {
-   return reinterpret_cast<PyArrayDescr_Proxy*>(ptr);
+inline PyArrayDescr_Proxy *array_descriptor_proxy(PyObject *ptr) {
+  return reinterpret_cast<PyArrayDescr_Proxy *>(ptr);
 }
 
-inline const PyArrayDescr_Proxy* array_descriptor_proxy(const PyObject* ptr) {
-   return reinterpret_cast<const PyArrayDescr_Proxy*>(ptr);
+inline const PyArrayDescr_Proxy *array_descriptor_proxy(const PyObject *ptr) {
+  return reinterpret_cast<const PyArrayDescr_Proxy *>(ptr);
 }
 
-inline bool check_flags(const void* ptr, int flag) {
-    return (flag == (array_proxy(ptr)->flags & flag));
+inline bool check_flags(const void *ptr, int flag) {
+  return (flag == (array_proxy(ptr)->flags & flag));
 }
 
-template <typename T> struct is_std_array : std::false_type { };
-template <typename T, size_t N> struct is_std_array<std::array<T, N>> : std::true_type { };
-template <typename T> struct is_complex : std::false_type { };
-template <typename T> struct is_complex<std::complex<T>> : std::true_type { };
+template <typename T> struct is_std_array : std::false_type {};
+template <typename T, size_t N>
+struct is_std_array<std::array<T, N>> : std::true_type {};
+template <typename T> struct is_complex : std::false_type {};
+template <typename T> struct is_complex<std::complex<T>> : std::true_type {};
 
 template <typename T> struct array_info_scalar {
-    typedef T type;
-    static constexpr bool is_array = false;
-    static constexpr bool is_empty = false;
-    static constexpr auto extents = _("");
-    static void append_extents(list& /* shape */) { }
+  typedef T type;
+  static constexpr bool is_array = false;
+  static constexpr bool is_empty = false;
+  static constexpr auto extents = _("");
+  static void append_extents(list & /* shape */) {}
 };
 // Computes underlying type and a comma-separated list of extents for array
 // types (any mix of std::array and built-in arrays). An array of char is
 // treated as scalar because it gets special handling.
-template <typename T> struct array_info : array_info_scalar<T> { };
+template <typename T> struct array_info : array_info_scalar<T> {};
 template <typename T, size_t N> struct array_info<std::array<T, N>> {
-    using type = typename array_info<T>::type;
-    static constexpr bool is_array = true;
-    static constexpr bool is_empty = (N == 0) || array_info<T>::is_empty;
-    static constexpr size_t extent = N;
+  using type = typename array_info<T>::type;
+  static constexpr bool is_array = true;
+  static constexpr bool is_empty = (N == 0) || array_info<T>::is_empty;
+  static constexpr size_t extent = N;
 
-    // appends the extents to shape
-    static void append_extents(list& shape) {
-        shape.append(N);
-        array_info<T>::append_extents(shape);
-    }
+  // appends the extents to shape
+  static void append_extents(list &shape) {
+    shape.append(N);
+    array_info<T>::append_extents(shape);
+  }
 
-    static constexpr auto extents = _<array_info<T>::is_array>(
-        concat(_<N>(), array_info<T>::extents), _<N>()
-    );
+  static constexpr auto extents = _<array_info<T>::is_array>(
+      concat(_<N>(), array_info<T>::extents), _<N>());
 };
-// For numpy we have special handling for arrays of characters, so we don't include
-// the size in the array extents.
-template <size_t N> struct array_info<char[N]> : array_info_scalar<char[N]> { };
-template <size_t N> struct array_info<std::array<char, N>> : array_info_scalar<std::array<char, N>> { };
-template <typename T, size_t N> struct array_info<T[N]> : array_info<std::array<T, N>> { };
+// For numpy we have special handling for arrays of characters, so we don't
+// include the size in the array extents.
+template <size_t N> struct array_info<char[N]> : array_info_scalar<char[N]> {};
+template <size_t N>
+struct array_info<std::array<char, N>>
+    : array_info_scalar<std::array<char, N>> {};
+template <typename T, size_t N>
+struct array_info<T[N]> : array_info<std::array<T, N>> {};
 template <typename T> using remove_all_extents_t = typename array_info<T>::type;
 
-template <typename T> using is_pod_struct = all_of<
-    std::is_standard_layout<T>,     // since we're accessing directly in memory we need a standard layout type
-#if !defined(__GNUG__) || defined(_LIBCPP_VERSION) || defined(_GLIBCXX_USE_CXX11_ABI)
-    // _GLIBCXX_USE_CXX11_ABI indicates that we're using libstdc++ from GCC 5 or newer, independent
-    // of the actual compiler (Clang can also use libstdc++, but it always defines __GNUC__ == 4).
-    std::is_trivially_copyable<T>,
+template <typename T>
+using is_pod_struct =
+    all_of<std::is_standard_layout<T>, // since we're accessing directly in
+                                       // memory we need a standard layout type
+#if !defined(__GNUG__) || defined(_LIBCPP_VERSION) ||                          \
+    defined(_GLIBCXX_USE_CXX11_ABI)
+           // _GLIBCXX_USE_CXX11_ABI indicates that we're using libstdc++ from
+           // GCC 5 or newer, independent of the actual compiler (Clang can also
+           // use libstdc++, but it always defines __GNUC__ == 4).
+           std::is_trivially_copyable<T>,
 #else
-    // GCC 4 doesn't implement is_trivially_copyable, so approximate it
-    std::is_trivially_destructible<T>,
-    satisfies_any_of<T, std::has_trivial_copy_constructor, std::has_trivial_copy_assign>,
+           // GCC 4 doesn't implement is_trivially_copyable, so approximate it
+           std::is_trivially_destructible<T>,
+           satisfies_any_of<T, std::has_trivial_copy_constructor,
+                            std::has_trivial_copy_assign>,
 #endif
-    satisfies_none_of<T, std::is_reference, std::is_array, is_std_array, std::is_arithmetic, is_complex, std::is_enum>
->;
+           satisfies_none_of<T, std::is_reference, std::is_array, is_std_array,
+                             std::is_arithmetic, is_complex, std::is_enum>>;
 
-template <ssize_t Dim = 0, typename Strides> ssize_t byte_offset_unsafe(const Strides &) { return 0; }
+template <ssize_t Dim = 0, typename Strides>
+ssize_t byte_offset_unsafe(const Strides &) {
+  return 0;
+}
 template <ssize_t Dim = 0, typename Strides, typename... Ix>
 ssize_t byte_offset_unsafe(const Strides &strides, ssize_t i, Ix... index) {
-    return i * strides[Dim] + byte_offset_unsafe<Dim + 1>(strides, index...);
+  return i * strides[Dim] + byte_offset_unsafe<Dim + 1>(strides, index...);
 }
 
 /**
- * Proxy class providing unsafe, unchecked const access to array data.  This is constructed through
- * the `unchecked<T, N>()` method of `array` or the `unchecked<N>()` method of `array_t<T>`.  `Dims`
- * will be -1 for dimensions determined at runtime.
+ * Proxy class providing unsafe, unchecked const access to array data.  This is
+ * constructed through the `unchecked<T, N>()` method of `array` or the
+ * `unchecked<N>()` method of `array_t<T>`.  `Dims` will be -1 for dimensions
+ * determined at runtime.
  */
-template <typename T, ssize_t Dims>
-class unchecked_reference {
+template <typename T, ssize_t Dims> class unchecked_reference {
 protected:
-    static constexpr bool Dynamic = Dims < 0;
-    const unsigned char *data_;
-    // Storing the shape & strides in local variables (i.e. these arrays) allows the compiler to
-    // make large performance gains on big, nested loops, but requires compile-time dimensions
-    conditional_t<Dynamic, const ssize_t *, std::array<ssize_t, (size_t) Dims>>
-            shape_, strides_;
-    const ssize_t dims_;
+  static constexpr bool Dynamic = Dims < 0;
+  const unsigned char *data_;
+  // Storing the shape & strides in local variables (i.e. these arrays) allows
+  // the compiler to make large performance gains on big, nested loops, but
+  // requires compile-time dimensions
+  conditional_t<Dynamic, const ssize_t *, std::array<ssize_t, (size_t)Dims>>
+      shape_, strides_;
+  const ssize_t dims_;
 
-    friend class pybind11::array;
-    // Constructor for compile-time dimensions:
-    template <bool Dyn = Dynamic>
-    unchecked_reference(const void *data, const ssize_t *shape, const ssize_t *strides, enable_if_t<!Dyn, ssize_t>)
-    : data_{reinterpret_cast<const unsigned char *>(data)}, dims_{Dims} {
-        for (size_t i = 0; i < (size_t) dims_; i++) {
-            shape_[i] = shape[i];
-            strides_[i] = strides[i];
-        }
+  friend class pybind11::array;
+  // Constructor for compile-time dimensions:
+  template <bool Dyn = Dynamic>
+  unchecked_reference(const void *data, const ssize_t *shape,
+                      const ssize_t *strides, enable_if_t<!Dyn, ssize_t>)
+      : data_{reinterpret_cast<const unsigned char *>(data)}, dims_{Dims} {
+    for (size_t i = 0; i < (size_t)dims_; i++) {
+      shape_[i] = shape[i];
+      strides_[i] = strides[i];
     }
-    // Constructor for runtime dimensions:
-    template <bool Dyn = Dynamic>
-    unchecked_reference(const void *data, const ssize_t *shape, const ssize_t *strides, enable_if_t<Dyn, ssize_t> dims)
-    : data_{reinterpret_cast<const unsigned char *>(data)}, shape_{shape}, strides_{strides}, dims_{dims} {}
+  }
+  // Constructor for runtime dimensions:
+  template <bool Dyn = Dynamic>
+  unchecked_reference(const void *data, const ssize_t *shape,
+                      const ssize_t *strides, enable_if_t<Dyn, ssize_t> dims)
+      : data_{reinterpret_cast<const unsigned char *>(data)}, shape_{shape},
+        strides_{strides}, dims_{dims} {}
 
 public:
-    /**
-     * Unchecked const reference access to data at the given indices.  For a compile-time known
-     * number of dimensions, this requires the correct number of arguments; for run-time
-     * dimensionality, this is not checked (and so is up to the caller to use safely).
-     */
-    template <typename... Ix> const T &operator()(Ix... index) const {
-        static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
-                "Invalid number of indices for unchecked array reference");
-        return *reinterpret_cast<const T *>(data_ + byte_offset_unsafe(strides_, ssize_t(index)...));
-    }
-    /**
-     * Unchecked const reference access to data; this operator only participates if the reference
-     * is to a 1-dimensional array.  When present, this is exactly equivalent to `obj(index)`.
-     */
-    template <ssize_t D = Dims, typename = enable_if_t<D == 1 || Dynamic>>
-    const T &operator[](ssize_t index) const { return operator()(index); }
+  /**
+   * Unchecked const reference access to data at the given indices.  For a
+   * compile-time known number of dimensions, this requires the correct number
+   * of arguments; for run-time dimensionality, this is not checked (and so is
+   * up to the caller to use safely).
+   */
+  template <typename... Ix> const T &operator()(Ix... index) const {
+    static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
+                  "Invalid number of indices for unchecked array reference");
+    return *reinterpret_cast<const T *>(
+        data_ + byte_offset_unsafe(strides_, ssize_t(index)...));
+  }
+  /**
+   * Unchecked const reference access to data; this operator only participates
+   * if the reference is to a 1-dimensional array.  When present, this is
+   * exactly equivalent to `obj(index)`.
+   */
+  template <ssize_t D = Dims, typename = enable_if_t<D == 1 || Dynamic>>
+  const T &operator[](ssize_t index) const {
+    return operator()(index);
+  }
 
-    /// Pointer access to the data at the given indices.
-    template <typename... Ix> const T *data(Ix... ix) const { return &operator()(ssize_t(ix)...); }
+  /// Pointer access to the data at the given indices.
+  template <typename... Ix> const T *data(Ix... ix) const {
+    return &operator()(ssize_t(ix)...);
+  }
 
-    /// Returns the item size, i.e. sizeof(T)
-    constexpr static ssize_t itemsize() { return sizeof(T); }
+  /// Returns the item size, i.e. sizeof(T)
+  constexpr static ssize_t itemsize() { return sizeof(T); }
 
-    /// Returns the shape (i.e. size) of dimension `dim`
-    ssize_t shape(ssize_t dim) const { return shape_[(size_t) dim]; }
+  /// Returns the shape (i.e. size) of dimension `dim`
+  ssize_t shape(ssize_t dim) const { return shape_[(size_t)dim]; }
 
-    /// Returns the number of dimensions of the array
-    ssize_t ndim() const { return dims_; }
+  /// Returns the number of dimensions of the array
+  ssize_t ndim() const { return dims_; }
 
-    /// Returns the total number of elements in the referenced array, i.e. the product of the shapes
-    template <bool Dyn = Dynamic>
-    enable_if_t<!Dyn, ssize_t> size() const {
-        return std::accumulate(shape_.begin(), shape_.end(), (ssize_t) 1, std::multiplies<ssize_t>());
-    }
-    template <bool Dyn = Dynamic>
-    enable_if_t<Dyn, ssize_t> size() const {
-        return std::accumulate(shape_, shape_ + ndim(), (ssize_t) 1, std::multiplies<ssize_t>());
-    }
+  /// Returns the total number of elements in the referenced array, i.e. the
+  /// product of the shapes
+  template <bool Dyn = Dynamic> enable_if_t<!Dyn, ssize_t> size() const {
+    return std::accumulate(shape_.begin(), shape_.end(), (ssize_t)1,
+                           std::multiplies<ssize_t>());
+  }
+  template <bool Dyn = Dynamic> enable_if_t<Dyn, ssize_t> size() const {
+    return std::accumulate(shape_, shape_ + ndim(), (ssize_t)1,
+                           std::multiplies<ssize_t>());
+  }
 
-    /// Returns the total number of bytes used by the referenced data.  Note that the actual span in
-    /// memory may be larger if the referenced array has non-contiguous strides (e.g. for a slice).
-    ssize_t nbytes() const {
-        return size() * itemsize();
-    }
+  /// Returns the total number of bytes used by the referenced data.  Note that
+  /// the actual span in memory may be larger if the referenced array has
+  /// non-contiguous strides (e.g. for a slice).
+  ssize_t nbytes() const { return size() * itemsize(); }
 };
 
 template <typename T, ssize_t Dims>
 class unchecked_mutable_reference : public unchecked_reference<T, Dims> {
-    friend class pybind11::array;
-    using ConstBase = unchecked_reference<T, Dims>;
-    using ConstBase::ConstBase;
-    using ConstBase::Dynamic;
-public:
-    /// Mutable, unchecked access to data at the given indices.
-    template <typename... Ix> T& operator()(Ix... index) {
-        static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
-                "Invalid number of indices for unchecked array reference");
-        return const_cast<T &>(ConstBase::operator()(index...));
-    }
-    /**
-     * Mutable, unchecked access data at the given index; this operator only participates if the
-     * reference is to a 1-dimensional array (or has runtime dimensions).  When present, this is
-     * exactly equivalent to `obj(index)`.
-     */
-    template <ssize_t D = Dims, typename = enable_if_t<D == 1 || Dynamic>>
-    T &operator[](ssize_t index) { return operator()(index); }
+  friend class pybind11::array;
+  using ConstBase = unchecked_reference<T, Dims>;
+  using ConstBase::ConstBase;
+  using ConstBase::Dynamic;
 
-    /// Mutable pointer access to the data at the given indices.
-    template <typename... Ix> T *mutable_data(Ix... ix) { return &operator()(ssize_t(ix)...); }
+public:
+  /// Mutable, unchecked access to data at the given indices.
+  template <typename... Ix> T &operator()(Ix... index) {
+    static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
+                  "Invalid number of indices for unchecked array reference");
+    return const_cast<T &>(ConstBase::operator()(index...));
+  }
+  /**
+   * Mutable, unchecked access data at the given index; this operator only
+   * participates if the reference is to a 1-dimensional array (or has runtime
+   * dimensions).  When present, this is exactly equivalent to `obj(index)`.
+   */
+  template <ssize_t D = Dims, typename = enable_if_t<D == 1 || Dynamic>>
+  T &operator[](ssize_t index) {
+    return operator()(index);
+  }
+
+  /// Mutable pointer access to the data at the given indices.
+  template <typename... Ix> T *mutable_data(Ix... ix) {
+    return &operator()(ssize_t(ix)...);
+  }
 };
 
 template <typename T, ssize_t Dim>
 struct type_caster<unchecked_reference<T, Dim>> {
-    static_assert(Dim == 0 && Dim > 0 /* always fail */, "unchecked array proxy object is not castable");
+  static_assert(Dim == 0 && Dim > 0 /* always fail */,
+                "unchecked array proxy object is not castable");
 };
 template <typename T, ssize_t Dim>
-struct type_caster<unchecked_mutable_reference<T, Dim>> : type_caster<unchecked_reference<T, Dim>> {};
+struct type_caster<unchecked_mutable_reference<T, Dim>>
+    : type_caster<unchecked_reference<T, Dim>> {};
 
 NAMESPACE_END(detail)
 
 class dtype : public object {
 public:
-    PYBIND11_OBJECT_DEFAULT(dtype, object, detail::npy_api::get().PyArrayDescr_Check_);
+  PYBIND11_OBJECT_DEFAULT(dtype, object,
+                          detail::npy_api::get().PyArrayDescr_Check_);
 
-    explicit dtype(const buffer_info &info) {
-        dtype descr(_dtype_from_pep3118()(PYBIND11_STR_TYPE(info.format)));
-        // If info.itemsize == 0, use the value calculated from the format string
-        m_ptr = descr.strip_padding(info.itemsize ? info.itemsize : descr.itemsize()).release().ptr();
-    }
+  explicit dtype(const buffer_info &info) {
+    dtype descr(_dtype_from_pep3118()(PYBIND11_STR_TYPE(info.format)));
+    // If info.itemsize == 0, use the value calculated from the format string
+    m_ptr =
+        descr.strip_padding(info.itemsize ? info.itemsize : descr.itemsize())
+            .release()
+            .ptr();
+  }
 
-    explicit dtype(const std::string &format) {
-        m_ptr = from_args(pybind11::str(format)).release().ptr();
-    }
+  explicit dtype(const std::string &format) {
+    m_ptr = from_args(pybind11::str(format)).release().ptr();
+  }
 
-    dtype(const char *format) : dtype(std::string(format)) { }
+  dtype(const char *format) : dtype(std::string(format)) {}
 
-    dtype(list names, list formats, list offsets, ssize_t itemsize) {
-        dict args;
-        args["names"] = names;
-        args["formats"] = formats;
-        args["offsets"] = offsets;
-        args["itemsize"] = pybind11::int_(itemsize);
-        m_ptr = from_args(args).release().ptr();
-    }
+  dtype(list names, list formats, list offsets, ssize_t itemsize) {
+    dict args;
+    args["names"] = names;
+    args["formats"] = formats;
+    args["offsets"] = offsets;
+    args["itemsize"] = pybind11::int_(itemsize);
+    m_ptr = from_args(args).release().ptr();
+  }
 
-    /// This is essentially the same as calling numpy.dtype(args) in Python.
-    static dtype from_args(object args) {
-        PyObject *ptr = nullptr;
-        if (!detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) || !ptr)
-            throw error_already_set();
-        return reinterpret_steal<dtype>(ptr);
-    }
+  /// This is essentially the same as calling numpy.dtype(args) in Python.
+  static dtype from_args(object args) {
+    PyObject *ptr = nullptr;
+    if (!detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) ||
+        !ptr)
+      throw error_already_set();
+    return reinterpret_steal<dtype>(ptr);
+  }
 
-    /// Return dtype associated with a C++ type.
-    template <typename T> static dtype of() {
-        return detail::npy_format_descriptor<typename std::remove_cv<T>::type>::dtype();
-    }
+  /// Return dtype associated with a C++ type.
+  template <typename T> static dtype of() {
+    return detail::npy_format_descriptor<
+        typename std::remove_cv<T>::type>::dtype();
+  }
 
-    /// Size of the data type in bytes.
-    ssize_t itemsize() const {
-        return detail::array_descriptor_proxy(m_ptr)->elsize;
-    }
+  /// Size of the data type in bytes.
+  ssize_t itemsize() const {
+    return detail::array_descriptor_proxy(m_ptr)->elsize;
+  }
 
-    /// Returns true for structured data types.
-    bool has_fields() const {
-        return detail::array_descriptor_proxy(m_ptr)->names != nullptr;
-    }
+  /// Returns true for structured data types.
+  bool has_fields() const {
+    return detail::array_descriptor_proxy(m_ptr)->names != nullptr;
+  }
 
-    /// Single-character type code.
-    char kind() const {
-        return detail::array_descriptor_proxy(m_ptr)->kind;
-    }
+  /// Single-character type code.
+  char kind() const { return detail::array_descriptor_proxy(m_ptr)->kind; }
 
 private:
-    static object _dtype_from_pep3118() {
-        static PyObject *obj = module::import("numpy.core._internal")
-            .attr("_dtype_from_pep3118").cast<object>().release().ptr();
-        return reinterpret_borrow<object>(obj);
+  static object _dtype_from_pep3118() {
+    static PyObject *obj = module::import("numpy.core._internal")
+                               .attr("_dtype_from_pep3118")
+                               .cast<object>()
+                               .release()
+                               .ptr();
+    return reinterpret_borrow<object>(obj);
+  }
+
+  dtype strip_padding(ssize_t itemsize) {
+    // Recursively strip all void fields with empty names that are generated for
+    // padding fields (as of NumPy v1.11).
+    if (!has_fields())
+      return *this;
+
+    struct field_descr {
+      PYBIND11_STR_TYPE name;
+      object format;
+      pybind11::int_ offset;
+    };
+    std::vector<field_descr> field_descriptors;
+
+    for (auto field : attr("fields").attr("items")()) {
+      auto spec = field.cast<tuple>();
+      auto name = spec[0].cast<pybind11::str>();
+      auto format = spec[1].cast<tuple>()[0].cast<dtype>();
+      auto offset = spec[1].cast<tuple>()[1].cast<pybind11::int_>();
+      if (!len(name) && format.kind() == 'V')
+        continue;
+      field_descriptors.push_back({(PYBIND11_STR_TYPE)name,
+                                   format.strip_padding(format.itemsize()),
+                                   offset});
     }
 
-    dtype strip_padding(ssize_t itemsize) {
-        // Recursively strip all void fields with empty names that are generated for
-        // padding fields (as of NumPy v1.11).
-        if (!has_fields())
-            return *this;
+    std::sort(field_descriptors.begin(), field_descriptors.end(),
+              [](const field_descr &a, const field_descr &b) {
+                return a.offset.cast<int>() < b.offset.cast<int>();
+              });
 
-        struct field_descr { PYBIND11_STR_TYPE name; object format; pybind11::int_ offset; };
-        std::vector<field_descr> field_descriptors;
-
-        for (auto field : attr("fields").attr("items")()) {
-            auto spec = field.cast<tuple>();
-            auto name = spec[0].cast<pybind11::str>();
-            auto format = spec[1].cast<tuple>()[0].cast<dtype>();
-            auto offset = spec[1].cast<tuple>()[1].cast<pybind11::int_>();
-            if (!len(name) && format.kind() == 'V')
-                continue;
-            field_descriptors.push_back({(PYBIND11_STR_TYPE) name, format.strip_padding(format.itemsize()), offset});
-        }
-
-        std::sort(field_descriptors.begin(), field_descriptors.end(),
-                  [](const field_descr& a, const field_descr& b) {
-                      return a.offset.cast<int>() < b.offset.cast<int>();
-                  });
-
-        list names, formats, offsets;
-        for (auto& descr : field_descriptors) {
-            names.append(descr.name);
-            formats.append(descr.format);
-            offsets.append(descr.offset);
-        }
-        return dtype(names, formats, offsets, itemsize);
+    list names, formats, offsets;
+    for (auto &descr : field_descriptors) {
+      names.append(descr.name);
+      formats.append(descr.format);
+      offsets.append(descr.offset);
     }
+    return dtype(names, formats, offsets, itemsize);
+  }
 };
 
 class array : public buffer {
 public:
-    PYBIND11_OBJECT_CVT(array, buffer, detail::npy_api::get().PyArray_Check_, raw_array)
+  PYBIND11_OBJECT_CVT(array, buffer, detail::npy_api::get().PyArray_Check_,
+                      raw_array)
 
-    enum {
-        c_style = detail::npy_api::NPY_ARRAY_C_CONTIGUOUS_,
-        f_style = detail::npy_api::NPY_ARRAY_F_CONTIGUOUS_,
-        forcecast = detail::npy_api::NPY_ARRAY_FORCECAST_
-    };
+  enum {
+    c_style = detail::npy_api::NPY_ARRAY_C_CONTIGUOUS_,
+    f_style = detail::npy_api::NPY_ARRAY_F_CONTIGUOUS_,
+    forcecast = detail::npy_api::NPY_ARRAY_FORCECAST_
+  };
 
-    array() : array({{0}}, static_cast<const double *>(nullptr)) {}
+  array() : array({{0}}, static_cast<const double *>(nullptr)) {}
 
-    using ShapeContainer = detail::any_container<ssize_t>;
-    using StridesContainer = detail::any_container<ssize_t>;
+  using ShapeContainer = detail::any_container<ssize_t>;
+  using StridesContainer = detail::any_container<ssize_t>;
 
-    // Constructs an array taking shape/strides from arbitrary container types
-    array(const pybind11::dtype &dt, ShapeContainer shape, StridesContainer strides,
-          const void *ptr = nullptr, handle base = handle()) {
+  // Constructs an array taking shape/strides from arbitrary container types
+  array(const pybind11::dtype &dt, ShapeContainer shape,
+        StridesContainer strides, const void *ptr = nullptr,
+        handle base = handle()) {
 
-        if (strides->empty())
-            *strides = c_strides(*shape, dt.itemsize());
+    if (strides->empty())
+      *strides = c_strides(*shape, dt.itemsize());
 
-        auto ndim = shape->size();
-        if (ndim != strides->size())
-            pybind11_fail("NumPy: shape ndim doesn't match strides ndim");
-        auto descr = dt;
+    auto ndim = shape->size();
+    if (ndim != strides->size())
+      pybind11_fail("NumPy: shape ndim doesn't match strides ndim");
+    auto descr = dt;
 
-        int flags = 0;
-        if (base && ptr) {
-            if (isinstance<array>(base))
-                /* Copy flags from base (except ownership bit) */
-                flags = reinterpret_borrow<array>(base).flags() & ~detail::npy_api::NPY_ARRAY_OWNDATA_;
-            else
-                /* Writable by default, easy to downgrade later on if needed */
-                flags = detail::npy_api::NPY_ARRAY_WRITEABLE_;
-        }
-
-        auto &api = detail::npy_api::get();
-        auto tmp = reinterpret_steal<object>(api.PyArray_NewFromDescr_(
-            api.PyArray_Type_, descr.release().ptr(), (int) ndim, shape->data(), strides->data(),
-            const_cast<void *>(ptr), flags, nullptr));
-        if (!tmp)
-            throw error_already_set();
-        if (ptr) {
-            if (base) {
-                api.PyArray_SetBaseObject_(tmp.ptr(), base.inc_ref().ptr());
-            } else {
-                tmp = reinterpret_steal<object>(api.PyArray_NewCopy_(tmp.ptr(), -1 /* any order */));
-            }
-        }
-        m_ptr = tmp.release().ptr();
+    int flags = 0;
+    if (base && ptr) {
+      if (isinstance<array>(base))
+        /* Copy flags from base (except ownership bit) */
+        flags = reinterpret_borrow<array>(base).flags() &
+                ~detail::npy_api::NPY_ARRAY_OWNDATA_;
+      else
+        /* Writable by default, easy to downgrade later on if needed */
+        flags = detail::npy_api::NPY_ARRAY_WRITEABLE_;
     }
 
-    array(const pybind11::dtype &dt, ShapeContainer shape, const void *ptr = nullptr, handle base = handle())
-        : array(dt, std::move(shape), {}, ptr, base) { }
-
-    template <typename T, typename = detail::enable_if_t<std::is_integral<T>::value && !std::is_same<bool, T>::value>>
-    array(const pybind11::dtype &dt, T count, const void *ptr = nullptr, handle base = handle())
-        : array(dt, {{count}}, ptr, base) { }
-
-    template <typename T>
-    array(ShapeContainer shape, StridesContainer strides, const T *ptr, handle base = handle())
-        : array(pybind11::dtype::of<T>(), std::move(shape), std::move(strides), ptr, base) { }
-
-    template <typename T>
-    array(ShapeContainer shape, const T *ptr, handle base = handle())
-        : array(std::move(shape), {}, ptr, base) { }
-
-    template <typename T>
-    explicit array(ssize_t count, const T *ptr, handle base = handle()) : array({count}, {}, ptr, base) { }
-
-    explicit array(const buffer_info &info)
-    : array(pybind11::dtype(info), info.shape, info.strides, info.ptr) { }
-
-    /// Array descriptor (dtype)
-    pybind11::dtype dtype() const {
-        return reinterpret_borrow<pybind11::dtype>(detail::array_proxy(m_ptr)->descr);
+    auto &api = detail::npy_api::get();
+    auto tmp = reinterpret_steal<object>(api.PyArray_NewFromDescr_(
+        api.PyArray_Type_, descr.release().ptr(), (int)ndim, shape->data(),
+        strides->data(), const_cast<void *>(ptr), flags, nullptr));
+    if (!tmp)
+      throw error_already_set();
+    if (ptr) {
+      if (base) {
+        api.PyArray_SetBaseObject_(tmp.ptr(), base.inc_ref().ptr());
+      } else {
+        tmp = reinterpret_steal<object>(
+            api.PyArray_NewCopy_(tmp.ptr(), -1 /* any order */));
+      }
     }
+    m_ptr = tmp.release().ptr();
+  }
 
-    /// Total number of elements
-    ssize_t size() const {
-        return std::accumulate(shape(), shape() + ndim(), (ssize_t) 1, std::multiplies<ssize_t>());
+  array(const pybind11::dtype &dt, ShapeContainer shape,
+        const void *ptr = nullptr, handle base = handle())
+      : array(dt, std::move(shape), {}, ptr, base) {}
+
+  template <typename T,
+            typename = detail::enable_if_t<std::is_integral<T>::value &&
+                                           !std::is_same<bool, T>::value>>
+  array(const pybind11::dtype &dt, T count, const void *ptr = nullptr,
+        handle base = handle())
+      : array(dt, {{count}}, ptr, base) {}
+
+  template <typename T>
+  array(ShapeContainer shape, StridesContainer strides, const T *ptr,
+        handle base = handle())
+      : array(pybind11::dtype::of<T>(), std::move(shape), std::move(strides),
+              ptr, base) {}
+
+  template <typename T>
+  array(ShapeContainer shape, const T *ptr, handle base = handle())
+      : array(std::move(shape), {}, ptr, base) {}
+
+  template <typename T>
+  explicit array(ssize_t count, const T *ptr, handle base = handle())
+      : array({count}, {}, ptr, base) {}
+
+  explicit array(const buffer_info &info)
+      : array(pybind11::dtype(info), info.shape, info.strides, info.ptr) {}
+
+  /// Array descriptor (dtype)
+  pybind11::dtype dtype() const {
+    return reinterpret_borrow<pybind11::dtype>(
+        detail::array_proxy(m_ptr)->descr);
+  }
+
+  /// Total number of elements
+  ssize_t size() const {
+    return std::accumulate(shape(), shape() + ndim(), (ssize_t)1,
+                           std::multiplies<ssize_t>());
+  }
+
+  /// Byte size of a single element
+  ssize_t itemsize() const {
+    return detail::array_descriptor_proxy(detail::array_proxy(m_ptr)->descr)
+        ->elsize;
+  }
+
+  /// Total number of bytes
+  ssize_t nbytes() const { return size() * itemsize(); }
+
+  /// Number of dimensions
+  ssize_t ndim() const { return detail::array_proxy(m_ptr)->nd; }
+
+  /// Base object
+  object base() const {
+    return reinterpret_borrow<object>(detail::array_proxy(m_ptr)->base);
+  }
+
+  /// Dimensions of the array
+  const ssize_t *shape() const {
+    return detail::array_proxy(m_ptr)->dimensions;
+  }
+
+  /// Dimension along a given axis
+  ssize_t shape(ssize_t dim) const {
+    if (dim >= ndim())
+      fail_dim_check(dim, "invalid axis");
+    return shape()[dim];
+  }
+
+  /// Strides of the array
+  const ssize_t *strides() const { return detail::array_proxy(m_ptr)->strides; }
+
+  /// Stride along a given axis
+  ssize_t strides(ssize_t dim) const {
+    if (dim >= ndim())
+      fail_dim_check(dim, "invalid axis");
+    return strides()[dim];
+  }
+
+  /// Return the NumPy array flags
+  int flags() const { return detail::array_proxy(m_ptr)->flags; }
+
+  /// If set, the array is writeable (otherwise the buffer is read-only)
+  bool writeable() const {
+    return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_WRITEABLE_);
+  }
+
+  /// If set, the array owns the data (will be freed when the array is deleted)
+  bool owndata() const {
+    return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_OWNDATA_);
+  }
+
+  /// Pointer to the contained data. If index is not provided, points to the
+  /// beginning of the buffer. May throw if the index would lead to out of
+  /// bounds access.
+  template <typename... Ix> const void *data(Ix... index) const {
+    return static_cast<const void *>(detail::array_proxy(m_ptr)->data +
+                                     offset_at(index...));
+  }
+
+  /// Mutable pointer to the contained data. If index is not provided, points to
+  /// the beginning of the buffer. May throw if the index would lead to out of
+  /// bounds access. May throw if the array is not writeable.
+  template <typename... Ix> void *mutable_data(Ix... index) {
+    check_writeable();
+    return static_cast<void *>(detail::array_proxy(m_ptr)->data +
+                               offset_at(index...));
+  }
+
+  /// Byte offset from beginning of the array to a given index (full or
+  /// partial). May throw if the index would lead to out of bounds access.
+  template <typename... Ix> ssize_t offset_at(Ix... index) const {
+    if ((ssize_t)sizeof...(index) > ndim())
+      fail_dim_check(sizeof...(index), "too many indices for an array");
+    return byte_offset(ssize_t(index)...);
+  }
+
+  ssize_t offset_at() const { return 0; }
+
+  /// Item count from beginning of the array to a given index (full or partial).
+  /// May throw if the index would lead to out of bounds access.
+  template <typename... Ix> ssize_t index_at(Ix... index) const {
+    return offset_at(index...) / itemsize();
+  }
+
+  /**
+   * Returns a proxy object that provides access to the array's data without
+   * bounds or dimensionality checking.  Will throw if the array is missing the
+   * `writeable` flag.  Use with care: the array must not be destroyed or
+   * reshaped for the duration of the returned object, and the caller must take
+   * care not to access invalid dimensions or dimension indices.
+   */
+  template <typename T, ssize_t Dims = -1>
+  detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
+    if (Dims >= 0 && ndim() != Dims)
+      throw std::domain_error("array has incorrect number of dimensions: " +
+                              std::to_string(ndim()) + "; expected " +
+                              std::to_string(Dims));
+    return detail::unchecked_mutable_reference<T, Dims>(mutable_data(), shape(),
+                                                        strides(), ndim());
+  }
+
+  /**
+   * Returns a proxy object that provides const access to the array's data
+   * without bounds or dimensionality checking.  Unlike `mutable_unchecked()`,
+   * this does not require that the underlying array have the `writeable` flag.
+   * Use with care: the array must not be destroyed or reshaped for the duration
+   * of the returned object, and the caller must take care not to access invalid
+   * dimensions or dimension indices.
+   */
+  template <typename T, ssize_t Dims = -1>
+  detail::unchecked_reference<T, Dims> unchecked() const & {
+    if (Dims >= 0 && ndim() != Dims)
+      throw std::domain_error("array has incorrect number of dimensions: " +
+                              std::to_string(ndim()) + "; expected " +
+                              std::to_string(Dims));
+    return detail::unchecked_reference<T, Dims>(data(), shape(), strides(),
+                                                ndim());
+  }
+
+  /// Return a new view with all of the dimensions of length 1 removed
+  array squeeze() {
+    auto &api = detail::npy_api::get();
+    return reinterpret_steal<array>(api.PyArray_Squeeze_(m_ptr));
+  }
+
+  /// Resize array to given shape
+  /// If refcheck is true and more than one reference exists to this array
+  /// then resize will succeed only if it makes a reshape, i.e. original size
+  /// doesn't change
+  void resize(ShapeContainer new_shape, bool refcheck = true) {
+    detail::npy_api::PyArray_Dims d = {new_shape->data(),
+                                       int(new_shape->size())};
+    // try to resize, set ordering param to -1 because it's not used anyway
+    object new_array = reinterpret_steal<object>(
+        detail::npy_api::get().PyArray_Resize_(m_ptr, &d, int(refcheck), -1));
+    if (!new_array)
+      throw error_already_set();
+    if (isinstance<array>(new_array)) {
+      *this = std::move(new_array);
     }
+  }
 
-    /// Byte size of a single element
-    ssize_t itemsize() const {
-        return detail::array_descriptor_proxy(detail::array_proxy(m_ptr)->descr)->elsize;
-    }
-
-    /// Total number of bytes
-    ssize_t nbytes() const {
-        return size() * itemsize();
-    }
-
-    /// Number of dimensions
-    ssize_t ndim() const {
-        return detail::array_proxy(m_ptr)->nd;
-    }
-
-    /// Base object
-    object base() const {
-        return reinterpret_borrow<object>(detail::array_proxy(m_ptr)->base);
-    }
-
-    /// Dimensions of the array
-    const ssize_t* shape() const {
-        return detail::array_proxy(m_ptr)->dimensions;
-    }
-
-    /// Dimension along a given axis
-    ssize_t shape(ssize_t dim) const {
-        if (dim >= ndim())
-            fail_dim_check(dim, "invalid axis");
-        return shape()[dim];
-    }
-
-    /// Strides of the array
-    const ssize_t* strides() const {
-        return detail::array_proxy(m_ptr)->strides;
-    }
-
-    /// Stride along a given axis
-    ssize_t strides(ssize_t dim) const {
-        if (dim >= ndim())
-            fail_dim_check(dim, "invalid axis");
-        return strides()[dim];
-    }
-
-    /// Return the NumPy array flags
-    int flags() const {
-        return detail::array_proxy(m_ptr)->flags;
-    }
-
-    /// If set, the array is writeable (otherwise the buffer is read-only)
-    bool writeable() const {
-        return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_WRITEABLE_);
-    }
-
-    /// If set, the array owns the data (will be freed when the array is deleted)
-    bool owndata() const {
-        return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_OWNDATA_);
-    }
-
-    /// Pointer to the contained data. If index is not provided, points to the
-    /// beginning of the buffer. May throw if the index would lead to out of bounds access.
-    template<typename... Ix> const void* data(Ix... index) const {
-        return static_cast<const void *>(detail::array_proxy(m_ptr)->data + offset_at(index...));
-    }
-
-    /// Mutable pointer to the contained data. If index is not provided, points to the
-    /// beginning of the buffer. May throw if the index would lead to out of bounds access.
-    /// May throw if the array is not writeable.
-    template<typename... Ix> void* mutable_data(Ix... index) {
-        check_writeable();
-        return static_cast<void *>(detail::array_proxy(m_ptr)->data + offset_at(index...));
-    }
-
-    /// Byte offset from beginning of the array to a given index (full or partial).
-    /// May throw if the index would lead to out of bounds access.
-    template<typename... Ix> ssize_t offset_at(Ix... index) const {
-        if ((ssize_t) sizeof...(index) > ndim())
-            fail_dim_check(sizeof...(index), "too many indices for an array");
-        return byte_offset(ssize_t(index)...);
-    }
-
-    ssize_t offset_at() const { return 0; }
-
-    /// Item count from beginning of the array to a given index (full or partial).
-    /// May throw if the index would lead to out of bounds access.
-    template<typename... Ix> ssize_t index_at(Ix... index) const {
-        return offset_at(index...) / itemsize();
-    }
-
-    /**
-     * Returns a proxy object that provides access to the array's data without bounds or
-     * dimensionality checking.  Will throw if the array is missing the `writeable` flag.  Use with
-     * care: the array must not be destroyed or reshaped for the duration of the returned object,
-     * and the caller must take care not to access invalid dimensions or dimension indices.
-     */
-    template <typename T, ssize_t Dims = -1> detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
-        if (Dims >= 0 && ndim() != Dims)
-            throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) +
-                    "; expected " + std::to_string(Dims));
-        return detail::unchecked_mutable_reference<T, Dims>(mutable_data(), shape(), strides(), ndim());
-    }
-
-    /**
-     * Returns a proxy object that provides const access to the array's data without bounds or
-     * dimensionality checking.  Unlike `mutable_unchecked()`, this does not require that the
-     * underlying array have the `writable` flag.  Use with care: the array must not be destroyed or
-     * reshaped for the duration of the returned object, and the caller must take care not to access
-     * invalid dimensions or dimension indices.
-     */
-    template <typename T, ssize_t Dims = -1> detail::unchecked_reference<T, Dims> unchecked() const & {
-        if (Dims >= 0 && ndim() != Dims)
-            throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) +
-                    "; expected " + std::to_string(Dims));
-        return detail::unchecked_reference<T, Dims>(data(), shape(), strides(), ndim());
-    }
-
-    /// Return a new view with all of the dimensions of length 1 removed
-    array squeeze() {
-        auto& api = detail::npy_api::get();
-        return reinterpret_steal<array>(api.PyArray_Squeeze_(m_ptr));
-    }
-
-    /// Resize array to given shape
-    /// If refcheck is true and more that one reference exist to this array
-    /// then resize will succeed only if it makes a reshape, i.e. original size doesn't change
-    void resize(ShapeContainer new_shape, bool refcheck = true) {
-        detail::npy_api::PyArray_Dims d = {
-            new_shape->data(), int(new_shape->size())
-        };
-        // try to resize, set ordering param to -1 cause it's not used anyway
-        object new_array = reinterpret_steal<object>(
-            detail::npy_api::get().PyArray_Resize_(m_ptr, &d, int(refcheck), -1)
-        );
-        if (!new_array) throw error_already_set();
-        if (isinstance<array>(new_array)) { *this = std::move(new_array); }
-    }
-
-    /// Ensure that the argument is a NumPy array
-    /// In case of an error, nullptr is returned and the Python error is cleared.
-    static array ensure(handle h, int ExtraFlags = 0) {
-        auto result = reinterpret_steal<array>(raw_array(h.ptr(), ExtraFlags));
-        if (!result)
-            PyErr_Clear();
-        return result;
-    }
+  /// Ensure that the argument is a NumPy array
+  /// In case of an error, nullptr is returned and the Python error is cleared.
+  static array ensure(handle h, int ExtraFlags = 0) {
+    auto result = reinterpret_steal<array>(raw_array(h.ptr(), ExtraFlags));
+    if (!result)
+      PyErr_Clear();
+    return result;
+  }
 
 protected:
-    template<typename, typename> friend struct detail::npy_format_descriptor;
+  template <typename, typename> friend struct detail::npy_format_descriptor;
 
-    void fail_dim_check(ssize_t dim, const std::string& msg) const {
-        throw index_error(msg + ": " + std::to_string(dim) +
-                          " (ndim = " + std::to_string(ndim()) + ")");
+  void fail_dim_check(ssize_t dim, const std::string &msg) const {
+    throw index_error(msg + ": " + std::to_string(dim) +
+                      " (ndim = " + std::to_string(ndim()) + ")");
+  }
+
+  template <typename... Ix> ssize_t byte_offset(Ix... index) const {
+    check_dimensions(index...);
+    return detail::byte_offset_unsafe(strides(), ssize_t(index)...);
+  }
+
+  void check_writeable() const {
+    if (!writeable())
+      throw std::domain_error("array is not writeable");
+  }
+
+  // Default, C-style strides
+  static std::vector<ssize_t> c_strides(const std::vector<ssize_t> &shape,
+                                        ssize_t itemsize) {
+    auto ndim = shape.size();
+    std::vector<ssize_t> strides(ndim, itemsize);
+    if (ndim > 0)
+      for (size_t i = ndim - 1; i > 0; --i)
+        strides[i - 1] = strides[i] * shape[i];
+    return strides;
+  }
+
+  // F-style strides; default when constructing an array_t with `ExtraFlags &
+  // f_style`
+  static std::vector<ssize_t> f_strides(const std::vector<ssize_t> &shape,
+                                        ssize_t itemsize) {
+    auto ndim = shape.size();
+    std::vector<ssize_t> strides(ndim, itemsize);
+    for (size_t i = 1; i < ndim; ++i)
+      strides[i] = strides[i - 1] * shape[i - 1];
+    return strides;
+  }
+
+  template <typename... Ix> void check_dimensions(Ix... index) const {
+    check_dimensions_impl(ssize_t(0), shape(), ssize_t(index)...);
+  }
+
+  void check_dimensions_impl(ssize_t, const ssize_t *) const {}
+
+  template <typename... Ix>
+  void check_dimensions_impl(ssize_t axis, const ssize_t *shape, ssize_t i,
+                             Ix... index) const {
+    if (i >= *shape) {
+      throw index_error(std::string("index ") + std::to_string(i) +
+                        " is out of bounds for axis " + std::to_string(axis) +
+                        " with size " + std::to_string(*shape));
     }
+    check_dimensions_impl(axis + 1, shape + 1, index...);
+  }
 
-    template<typename... Ix> ssize_t byte_offset(Ix... index) const {
-        check_dimensions(index...);
-        return detail::byte_offset_unsafe(strides(), ssize_t(index)...);
-    }
-
-    void check_writeable() const {
-        if (!writeable())
-            throw std::domain_error("array is not writeable");
-    }
-
-    // Default, C-style strides
-    static std::vector<ssize_t> c_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
-        auto ndim = shape.size();
-        std::vector<ssize_t> strides(ndim, itemsize);
-        if (ndim > 0)
-            for (size_t i = ndim - 1; i > 0; --i)
-                strides[i - 1] = strides[i] * shape[i];
-        return strides;
-    }
-
-    // F-style strides; default when constructing an array_t with `ExtraFlags & f_style`
-    static std::vector<ssize_t> f_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
-        auto ndim = shape.size();
-        std::vector<ssize_t> strides(ndim, itemsize);
-        for (size_t i = 1; i < ndim; ++i)
-            strides[i] = strides[i - 1] * shape[i - 1];
-        return strides;
-    }
-
-    template<typename... Ix> void check_dimensions(Ix... index) const {
-        check_dimensions_impl(ssize_t(0), shape(), ssize_t(index)...);
-    }
-
-    void check_dimensions_impl(ssize_t, const ssize_t*) const { }
-
-    template<typename... Ix> void check_dimensions_impl(ssize_t axis, const ssize_t* shape, ssize_t i, Ix... index) const {
-        if (i >= *shape) {
-            throw index_error(std::string("index ") + std::to_string(i) +
-                              " is out of bounds for axis " + std::to_string(axis) +
-                              " with size " + std::to_string(*shape));
-        }
-        check_dimensions_impl(axis + 1, shape + 1, index...);
-    }
-
-    /// Create array from any object -- always returns a new reference
-    static PyObject *raw_array(PyObject *ptr, int ExtraFlags = 0) {
-        if (ptr == nullptr) {
-            PyErr_SetString(PyExc_ValueError, "cannot create a pybind11::array from a nullptr");
-            return nullptr;
-        }
-        return detail::npy_api::get().PyArray_FromAny_(
-            ptr, nullptr, 0, 0, detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr);
+  /// Create array from any object -- always returns a new reference
+  static PyObject *raw_array(PyObject *ptr, int ExtraFlags = 0) {
+    if (ptr == nullptr) {
+      PyErr_SetString(PyExc_ValueError,
+                      "cannot create a pybind11::array from a nullptr");
+      return nullptr;
     }
+    return detail::npy_api::get().PyArray_FromAny_(
+        ptr, nullptr, 0, 0,
+        detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr);
+  }
 };
 
-template <typename T, int ExtraFlags = array::forcecast> class array_t : public array {
+template <typename T, int ExtraFlags = array::forcecast>
+class array_t : public array {
 private:
-    struct private_ctor {};
-    // Delegating constructor needed when both moving and accessing in the same constructor
-    array_t(private_ctor, ShapeContainer &&shape, StridesContainer &&strides, const T *ptr, handle base)
-        : array(std::move(shape), std::move(strides), ptr, base) {}
+  struct private_ctor {};
+  // Delegating constructor needed when both moving and accessing in the same
+  // constructor
+  array_t(private_ctor, ShapeContainer &&shape, StridesContainer &&strides,
+          const T *ptr, handle base)
+      : array(std::move(shape), std::move(strides), ptr, base) {}
+
 public:
-    static_assert(!detail::array_info<T>::is_array, "Array types cannot be used with array_t");
+  static_assert(!detail::array_info<T>::is_array,
+                "Array types cannot be used with array_t");
 
-    using value_type = T;
+  using value_type = T;
 
-    array_t() : array(0, static_cast<const T *>(nullptr)) {}
-    array_t(handle h, borrowed_t) : array(h, borrowed_t{}) { }
-    array_t(handle h, stolen_t) : array(h, stolen_t{}) { }
+  array_t() : array(0, static_cast<const T *>(nullptr)) {}
+  array_t(handle h, borrowed_t) : array(h, borrowed_t{}) {}
+  array_t(handle h, stolen_t) : array(h, stolen_t{}) {}
 
-    PYBIND11_DEPRECATED("Use array_t<T>::ensure() instead")
-    array_t(handle h, bool is_borrowed) : array(raw_array_t(h.ptr()), stolen_t{}) {
-        if (!m_ptr) PyErr_Clear();
-        if (!is_borrowed) Py_XDECREF(h.ptr());
-    }
+  PYBIND11_DEPRECATED("Use array_t<T>::ensure() instead")
+  array_t(handle h, bool is_borrowed)
+      : array(raw_array_t(h.ptr()), stolen_t{}) {
+    if (!m_ptr)
+      PyErr_Clear();
+    if (!is_borrowed)
+      Py_XDECREF(h.ptr());
+  }
 
-    array_t(const object &o) : array(raw_array_t(o.ptr()), stolen_t{}) {
-        if (!m_ptr) throw error_already_set();
-    }
+  array_t(const object &o) : array(raw_array_t(o.ptr()), stolen_t{}) {
+    if (!m_ptr)
+      throw error_already_set();
+  }
 
-    explicit array_t(const buffer_info& info) : array(info) { }
+  explicit array_t(const buffer_info &info) : array(info) {}
 
-    array_t(ShapeContainer shape, StridesContainer strides, const T *ptr = nullptr, handle base = handle())
-        : array(std::move(shape), std::move(strides), ptr, base) { }
+  array_t(ShapeContainer shape, StridesContainer strides,
+          const T *ptr = nullptr, handle base = handle())
+      : array(std::move(shape), std::move(strides), ptr, base) {}
 
-    explicit array_t(ShapeContainer shape, const T *ptr = nullptr, handle base = handle())
-        : array_t(private_ctor{}, std::move(shape),
-                ExtraFlags & f_style ? f_strides(*shape, itemsize()) : c_strides(*shape, itemsize()),
-                ptr, base) { }
+  explicit array_t(ShapeContainer shape, const T *ptr = nullptr,
+                   handle base = handle())
+      : array_t(private_ctor{}, std::move(shape),
+                ExtraFlags & f_style ? f_strides(*shape, itemsize())
+                                     : c_strides(*shape, itemsize()),
+                ptr, base) {}
 
-    explicit array_t(size_t count, const T *ptr = nullptr, handle base = handle())
-        : array({count}, {}, ptr, base) { }
+  explicit array_t(size_t count, const T *ptr = nullptr, handle base = handle())
+      : array({count}, {}, ptr, base) {}
 
-    constexpr ssize_t itemsize() const {
-        return sizeof(T);
-    }
+  constexpr ssize_t itemsize() const { return sizeof(T); }
 
-    template<typename... Ix> ssize_t index_at(Ix... index) const {
-        return offset_at(index...) / itemsize();
-    }
+  template <typename... Ix> ssize_t index_at(Ix... index) const {
+    return offset_at(index...) / itemsize();
+  }
 
-    template<typename... Ix> const T* data(Ix... index) const {
-        return static_cast<const T*>(array::data(index...));
-    }
+  template <typename... Ix> const T *data(Ix... index) const {
+    return static_cast<const T *>(array::data(index...));
+  }
 
-    template<typename... Ix> T* mutable_data(Ix... index) {
-        return static_cast<T*>(array::mutable_data(index...));
-    }
+  template <typename... Ix> T *mutable_data(Ix... index) {
+    return static_cast<T *>(array::mutable_data(index...));
+  }
 
-    // Reference to element at a given index
-    template<typename... Ix> const T& at(Ix... index) const {
-        if ((ssize_t) sizeof...(index) != ndim())
-            fail_dim_check(sizeof...(index), "index dimension mismatch");
-        return *(static_cast<const T*>(array::data()) + byte_offset(ssize_t(index)...) / itemsize());
-    }
+  // Reference to element at a given index
+  template <typename... Ix> const T &at(Ix... index) const {
+    if ((ssize_t)sizeof...(index) != ndim())
+      fail_dim_check(sizeof...(index), "index dimension mismatch");
+    return *(static_cast<const T *>(array::data()) +
+             byte_offset(ssize_t(index)...) / itemsize());
+  }
 
-    // Mutable reference to element at a given index
-    template<typename... Ix> T& mutable_at(Ix... index) {
-        if ((ssize_t) sizeof...(index) != ndim())
-            fail_dim_check(sizeof...(index), "index dimension mismatch");
-        return *(static_cast<T*>(array::mutable_data()) + byte_offset(ssize_t(index)...) / itemsize());
-    }
+  // Mutable reference to element at a given index
+  template <typename... Ix> T &mutable_at(Ix... index) {
+    if ((ssize_t)sizeof...(index) != ndim())
+      fail_dim_check(sizeof...(index), "index dimension mismatch");
+    return *(static_cast<T *>(array::mutable_data()) +
+             byte_offset(ssize_t(index)...) / itemsize());
+  }
 
-    /**
-     * Returns a proxy object that provides access to the array's data without bounds or
-     * dimensionality checking.  Will throw if the array is missing the `writeable` flag.  Use with
-     * care: the array must not be destroyed or reshaped for the duration of the returned object,
-     * and the caller must take care not to access invalid dimensions or dimension indices.
-     */
-    template <ssize_t Dims = -1> detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
-        return array::mutable_unchecked<T, Dims>();
-    }
+  /**
+   * Returns a proxy object that provides access to the array's data without
+   * bounds or dimensionality checking.  Will throw if the array is missing the
+   * `writeable` flag.  Use with care: the array must not be destroyed or
+   * reshaped for the duration of the returned object, and the caller must take
+   * care not to access invalid dimensions or dimension indices.
+   */
+  template <ssize_t Dims = -1>
+  detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
+    return array::mutable_unchecked<T, Dims>();
+  }
 
-    /**
-     * Returns a proxy object that provides const access to the array's data without bounds or
-     * dimensionality checking.  Unlike `unchecked()`, this does not require that the underlying
-     * array have the `writable` flag.  Use with care: the array must not be destroyed or reshaped
-     * for the duration of the returned object, and the caller must take care not to access invalid
-     * dimensions or dimension indices.
-     */
-    template <ssize_t Dims = -1> detail::unchecked_reference<T, Dims> unchecked() const & {
-        return array::unchecked<T, Dims>();
-    }
+  /**
+   * Returns a proxy object that provides const access to the array's data
+   * without bounds or dimensionality checking.  Unlike `mutable_unchecked()`,
+   * this does not require that the underlying array have the `writeable` flag.
+   * Use with care: the array must not be destroyed or reshaped for the
+   * duration of the returned object, and the caller must take care not to
+   * access invalid dimensions or dimension indices.
+   */
+  template <ssize_t Dims = -1>
+  detail::unchecked_reference<T, Dims> unchecked() const & {
+    return array::unchecked<T, Dims>();
+  }
 
-    /// Ensure that the argument is a NumPy array of the correct dtype (and if not, try to convert
-    /// it).  In case of an error, nullptr is returned and the Python error is cleared.
-    static array_t ensure(handle h) {
-        auto result = reinterpret_steal<array_t>(raw_array_t(h.ptr()));
-        if (!result)
-            PyErr_Clear();
-        return result;
-    }
+  /// Ensure that the argument is a NumPy array of the correct dtype (and if
+  /// not, try to convert it).  In case of an error, nullptr is returned and the
+  /// Python error is cleared.
+  static array_t ensure(handle h) {
+    auto result = reinterpret_steal<array_t>(raw_array_t(h.ptr()));
+    if (!result)
+      PyErr_Clear();
+    return result;
+  }
 
-    static bool check_(handle h) {
-        const auto &api = detail::npy_api::get();
-        return api.PyArray_Check_(h.ptr())
-               && api.PyArray_EquivTypes_(detail::array_proxy(h.ptr())->descr, dtype::of<T>().ptr());
-    }
+  static bool check_(handle h) {
+    const auto &api = detail::npy_api::get();
+    return api.PyArray_Check_(h.ptr()) &&
+           api.PyArray_EquivTypes_(detail::array_proxy(h.ptr())->descr,
+                                   dtype::of<T>().ptr());
+  }
 
 protected:
-    /// Create array from any object -- always returns a new reference
-    static PyObject *raw_array_t(PyObject *ptr) {
-        if (ptr == nullptr) {
-            PyErr_SetString(PyExc_ValueError, "cannot create a pybind11::array_t from a nullptr");
-            return nullptr;
-        }
-        return detail::npy_api::get().PyArray_FromAny_(
-            ptr, dtype::of<T>().release().ptr(), 0, 0,
-            detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr);
+  /// Create array from any object -- always returns a new reference
+  static PyObject *raw_array_t(PyObject *ptr) {
+    if (ptr == nullptr) {
+      PyErr_SetString(PyExc_ValueError,
+                      "cannot create a pybind11::array_t from a nullptr");
+      return nullptr;
     }
+    return detail::npy_api::get().PyArray_FromAny_(
+        ptr, dtype::of<T>().release().ptr(), 0, 0,
+        detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr);
+  }
 };
 
 template <typename T>
-struct format_descriptor<T, detail::enable_if_t<detail::is_pod_struct<T>::value>> {
-    static std::string format() {
-        return detail::npy_format_descriptor<typename std::remove_cv<T>::type>::format();
-    }
+struct format_descriptor<T,
+                         detail::enable_if_t<detail::is_pod_struct<T>::value>> {
+  static std::string format() {
+    return detail::npy_format_descriptor<
+        typename std::remove_cv<T>::type>::format();
+  }
 };
 
 template <size_t N> struct format_descriptor<char[N]> {
-    static std::string format() { return std::to_string(N) + "s"; }
+  static std::string format() { return std::to_string(N) + "s"; }
 };
 template <size_t N> struct format_descriptor<std::array<char, N>> {
-    static std::string format() { return std::to_string(N) + "s"; }
+  static std::string format() { return std::to_string(N) + "s"; }
 };
 
 template <typename T>
 struct format_descriptor<T, detail::enable_if_t<std::is_enum<T>::value>> {
-    static std::string format() {
-        return format_descriptor<
-            typename std::remove_cv<typename std::underlying_type<T>::type>::type>::format();
-    }
+  static std::string format() {
+    return format_descriptor<typename std::remove_cv<
+        typename std::underlying_type<T>::type>::type>::format();
+  }
 };
 
 template <typename T>
-struct format_descriptor<T, detail::enable_if_t<detail::array_info<T>::is_array>> {
-    static std::string format() {
-        using namespace detail;
-        static constexpr auto extents = _("(") + array_info<T>::extents + _(")");
-        return extents.text + format_descriptor<remove_all_extents_t<T>>::format();
-    }
+struct format_descriptor<T,
+                         detail::enable_if_t<detail::array_info<T>::is_array>> {
+  static std::string format() {
+    using namespace detail;
+    static constexpr auto extents = _("(") + array_info<T>::extents + _(")");
+    return extents.text + format_descriptor<remove_all_extents_t<T>>::format();
+  }
 };
 
 NAMESPACE_BEGIN(detail)
 template <typename T, int ExtraFlags>
 struct pyobject_caster<array_t<T, ExtraFlags>> {
-    using type = array_t<T, ExtraFlags>;
+  using type = array_t<T, ExtraFlags>;
 
-    bool load(handle src, bool convert) {
-        if (!convert && !type::check_(src))
-            return false;
-        value = type::ensure(src);
-        return static_cast<bool>(value);
-    }
+  bool load(handle src, bool convert) {
+    if (!convert && !type::check_(src))
+      return false;
+    value = type::ensure(src);
+    return static_cast<bool>(value);
+  }
 
-    static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) {
-        return src.inc_ref();
-    }
-    PYBIND11_TYPE_CASTER(type, handle_type_name<type>::name);
+  static handle cast(const handle &src, return_value_policy /* policy */,
+                     handle /* parent */) {
+    return src.inc_ref();
+  }
+  PYBIND11_TYPE_CASTER(type, handle_type_name<type>::name);
 };
 
 template <typename T>
-struct compare_buffer_info<T, detail::enable_if_t<detail::is_pod_struct<T>::value>> {
-    static bool compare(const buffer_info& b) {
-        return npy_api::get().PyArray_EquivTypes_(dtype::of<T>().ptr(), dtype(b).ptr());
-    }
+struct compare_buffer_info<
+    T, detail::enable_if_t<detail::is_pod_struct<T>::value>> {
+  static bool compare(const buffer_info &b) {
+    return npy_api::get().PyArray_EquivTypes_(dtype::of<T>().ptr(),
+                                              dtype(b).ptr());
+  }
 };
 
-template <typename T, typename = void>
-struct npy_format_descriptor_name;
+template <typename T, typename = void> struct npy_format_descriptor_name;
 
 template <typename T>
 struct npy_format_descriptor_name<T, enable_if_t<std::is_integral<T>::value>> {
-    static constexpr auto name = _<std::is_same<T, bool>::value>(
-        _("bool"), _<std::is_signed<T>::value>("int", "uint") + _<sizeof(T)*8>()
-    );
+  static constexpr auto name = _<std::is_same<T, bool>::value>(
+      _("bool"),
+      _<std::is_signed<T>::value>("int", "uint") + _<sizeof(T) * 8>());
 };
 
 template <typename T>
-struct npy_format_descriptor_name<T, enable_if_t<std::is_floating_point<T>::value>> {
-    static constexpr auto name = _<std::is_same<T, float>::value || std::is_same<T, double>::value>(
-        _("float") + _<sizeof(T)*8>(), _("longdouble")
-    );
+struct npy_format_descriptor_name<
+    T, enable_if_t<std::is_floating_point<T>::value>> {
+  static constexpr auto name = _ < std::is_same<T, float>::value ||
+                               std::is_same<T, double>::value >
+                                   (_("float") + _<sizeof(T) * 8>(),
+                                    _("longdouble"));
 };
 
 template <typename T>
 struct npy_format_descriptor_name<T, enable_if_t<is_complex<T>::value>> {
-    static constexpr auto name = _<std::is_same<typename T::value_type, float>::value
-                                   || std::is_same<typename T::value_type, double>::value>(
-        _("complex") + _<sizeof(typename T::value_type)*16>(), _("longcomplex")
-    );
+  static constexpr auto
+      name = _ < std::is_same<typename T::value_type, float>::value ||
+             std::is_same<typename T::value_type, double>::value >
+                 (_("complex") + _<sizeof(typename T::value_type) * 16>(),
+                  _("longcomplex"));
 };
 
 template <typename T>
-struct npy_format_descriptor<T, enable_if_t<satisfies_any_of<T, std::is_arithmetic, is_complex>::value>>
+struct npy_format_descriptor<
+    T, enable_if_t<satisfies_any_of<T, std::is_arithmetic, is_complex>::value>>
     : npy_format_descriptor_name<T> {
 private:
-    // NB: the order here must match the one in common.h
-    constexpr static const int values[15] = {
-        npy_api::NPY_BOOL_,
-        npy_api::NPY_BYTE_,   npy_api::NPY_UBYTE_,   npy_api::NPY_SHORT_,    npy_api::NPY_USHORT_,
-        npy_api::NPY_INT_,    npy_api::NPY_UINT_,    npy_api::NPY_LONGLONG_, npy_api::NPY_ULONGLONG_,
-        npy_api::NPY_FLOAT_,  npy_api::NPY_DOUBLE_,  npy_api::NPY_LONGDOUBLE_,
-        npy_api::NPY_CFLOAT_, npy_api::NPY_CDOUBLE_, npy_api::NPY_CLONGDOUBLE_
-    };
+  // NB: the order here must match the one in common.h
+  constexpr static const int values[15] = {
+      npy_api::NPY_BOOL_,   npy_api::NPY_BYTE_,     npy_api::NPY_UBYTE_,
+      npy_api::NPY_SHORT_,  npy_api::NPY_USHORT_,   npy_api::NPY_INT_,
+      npy_api::NPY_UINT_,   npy_api::NPY_LONGLONG_, npy_api::NPY_ULONGLONG_,
+      npy_api::NPY_FLOAT_,  npy_api::NPY_DOUBLE_,   npy_api::NPY_LONGDOUBLE_,
+      npy_api::NPY_CFLOAT_, npy_api::NPY_CDOUBLE_,  npy_api::NPY_CLONGDOUBLE_};
 
 public:
-    static constexpr int value = values[detail::is_fmt_numeric<T>::index];
+  static constexpr int value = values[detail::is_fmt_numeric<T>::index];
 
-    static pybind11::dtype dtype() {
-        if (auto ptr = npy_api::get().PyArray_DescrFromType_(value))
-            return reinterpret_borrow<pybind11::dtype>(ptr);
-        pybind11_fail("Unsupported buffer format!");
-    }
+  static pybind11::dtype dtype() {
+    if (auto ptr = npy_api::get().PyArray_DescrFromType_(value))
+      return reinterpret_borrow<pybind11::dtype>(ptr);
+    pybind11_fail("Unsupported buffer format!");
+  }
 };
 
-#define PYBIND11_DECL_CHAR_FMT \
-    static constexpr auto name = _("S") + _<N>(); \
-    static pybind11::dtype dtype() { return pybind11::dtype(std::string("S") + std::to_string(N)); }
-template <size_t N> struct npy_format_descriptor<char[N]> { PYBIND11_DECL_CHAR_FMT };
-template <size_t N> struct npy_format_descriptor<std::array<char, N>> { PYBIND11_DECL_CHAR_FMT };
+#define PYBIND11_DECL_CHAR_FMT                                                 \
+  static constexpr auto name = _("S") + _<N>();                                \
+  static pybind11::dtype dtype() {                                             \
+    return pybind11::dtype(std::string("S") + std::to_string(N));              \
+  }
+template <size_t N> struct npy_format_descriptor<char[N]> {
+  PYBIND11_DECL_CHAR_FMT
+};
+template <size_t N> struct npy_format_descriptor<std::array<char, N>> {
+  PYBIND11_DECL_CHAR_FMT
+};
 #undef PYBIND11_DECL_CHAR_FMT
 
-template<typename T> struct npy_format_descriptor<T, enable_if_t<array_info<T>::is_array>> {
+template <typename T>
+struct npy_format_descriptor<T, enable_if_t<array_info<T>::is_array>> {
 private:
-    using base_descr = npy_format_descriptor<typename array_info<T>::type>;
-public:
-    static_assert(!array_info<T>::is_empty, "Zero-sized arrays are not supported");
+  using base_descr = npy_format_descriptor<typename array_info<T>::type>;
 
-    static constexpr auto name = _("(") + array_info<T>::extents + _(")") + base_descr::name;
-    static pybind11::dtype dtype() {
-        list shape;
-        array_info<T>::append_extents(shape);
-        return pybind11::dtype::from_args(pybind11::make_tuple(base_descr::dtype(), shape));
-    }
+public:
+  static_assert(!array_info<T>::is_empty,
+                "Zero-sized arrays are not supported");
+
+  static constexpr auto name =
+      _("(") + array_info<T>::extents + _(")") + base_descr::name;
+  static pybind11::dtype dtype() {
+    list shape;
+    array_info<T>::append_extents(shape);
+    return pybind11::dtype::from_args(
+        pybind11::make_tuple(base_descr::dtype(), shape));
+  }
 };
 
-template<typename T> struct npy_format_descriptor<T, enable_if_t<std::is_enum<T>::value>> {
+template <typename T>
+struct npy_format_descriptor<T, enable_if_t<std::is_enum<T>::value>> {
 private:
-    using base_descr = npy_format_descriptor<typename std::underlying_type<T>::type>;
+  using base_descr =
+      npy_format_descriptor<typename std::underlying_type<T>::type>;
+
 public:
-    static constexpr auto name = base_descr::name;
-    static pybind11::dtype dtype() { return base_descr::dtype(); }
+  static constexpr auto name = base_descr::name;
+  static pybind11::dtype dtype() { return base_descr::dtype(); }
 };
 
 struct field_descriptor {
-    const char *name;
-    ssize_t offset;
-    ssize_t size;
-    std::string format;
-    dtype descr;
+  const char *name;
+  ssize_t offset;
+  ssize_t size;
+  std::string format;
+  dtype descr;
 };
 
-inline PYBIND11_NOINLINE void register_structured_dtype(
-    any_container<field_descriptor> fields,
-    const std::type_info& tinfo, ssize_t itemsize,
-    bool (*direct_converter)(PyObject *, void *&)) {
+inline PYBIND11_NOINLINE void
+register_structured_dtype(any_container<field_descriptor> fields,
+                          const std::type_info &tinfo, ssize_t itemsize,
+                          bool (*direct_converter)(PyObject *, void *&)) {
 
-    auto& numpy_internals = get_numpy_internals();
-    if (numpy_internals.get_type_info(tinfo, false))
-        pybind11_fail("NumPy: dtype is already registered");
+  auto &numpy_internals = get_numpy_internals();
+  if (numpy_internals.get_type_info(tinfo, false))
+    pybind11_fail("NumPy: dtype is already registered");
 
-    list names, formats, offsets;
-    for (auto field : *fields) {
-        if (!field.descr)
-            pybind11_fail(std::string("NumPy: unsupported field dtype: `") +
-                            field.name + "` @ " + tinfo.name());
-        names.append(PYBIND11_STR_TYPE(field.name));
-        formats.append(field.descr);
-        offsets.append(pybind11::int_(field.offset));
-    }
-    auto dtype_ptr = pybind11::dtype(names, formats, offsets, itemsize).release().ptr();
+  list names, formats, offsets;
+  for (auto field : *fields) {
+    if (!field.descr)
+      pybind11_fail(std::string("NumPy: unsupported field dtype: `") +
+                    field.name + "` @ " + tinfo.name());
+    names.append(PYBIND11_STR_TYPE(field.name));
+    formats.append(field.descr);
+    offsets.append(pybind11::int_(field.offset));
+  }
+  auto dtype_ptr =
+      pybind11::dtype(names, formats, offsets, itemsize).release().ptr();
 
-    // There is an existing bug in NumPy (as of v1.11): trailing bytes are
-    // not encoded explicitly into the format string. This will supposedly
-    // get fixed in v1.12; for further details, see these:
-    // - https://github.com/numpy/numpy/issues/7797
-    // - https://github.com/numpy/numpy/pull/7798
-    // Because of this, we won't use numpy's logic to generate buffer format
-    // strings and will just do it ourselves.
-    std::vector<field_descriptor> ordered_fields(std::move(fields));
-    std::sort(ordered_fields.begin(), ordered_fields.end(),
-        [](const field_descriptor &a, const field_descriptor &b) { return a.offset < b.offset; });
-    ssize_t offset = 0;
-    std::ostringstream oss;
-    // mark the structure as unaligned with '^', because numpy and C++ don't
-    // always agree about alignment (particularly for complex), and we're
-    // explicitly listing all our padding. This depends on none of the fields
-    // overriding the endianness. Putting the ^ in front of individual fields
-    // isn't guaranteed to work due to https://github.com/numpy/numpy/issues/9049
-    oss << "^T{";
-    for (auto& field : ordered_fields) {
-        if (field.offset > offset)
-            oss << (field.offset - offset) << 'x';
-        oss << field.format << ':' << field.name << ':';
-        offset = field.offset + field.size;
-    }
-    if (itemsize > offset)
-        oss << (itemsize - offset) << 'x';
-    oss << '}';
-    auto format_str = oss.str();
+  // There is an existing bug in NumPy (as of v1.11): trailing bytes are
+  // not encoded explicitly into the format string. This will supposedly
+  // get fixed in v1.12; for further details, see these:
+  // - https://github.com/numpy/numpy/issues/7797
+  // - https://github.com/numpy/numpy/pull/7798
+  // Because of this, we won't use numpy's logic to generate buffer format
+  // strings and will just do it ourselves.
+  std::vector<field_descriptor> ordered_fields(std::move(fields));
+  std::sort(ordered_fields.begin(), ordered_fields.end(),
+            [](const field_descriptor &a, const field_descriptor &b) {
+              return a.offset < b.offset;
+            });
+  ssize_t offset = 0;
+  std::ostringstream oss;
+  // mark the structure as unaligned with '^', because numpy and C++ don't
+  // always agree about alignment (particularly for complex), and we're
+  // explicitly listing all our padding. This depends on none of the fields
+  // overriding the endianness. Putting the ^ in front of individual fields
+  // isn't guaranteed to work due to https://github.com/numpy/numpy/issues/9049
+  oss << "^T{";
+  for (auto &field : ordered_fields) {
+    if (field.offset > offset)
+      oss << (field.offset - offset) << 'x';
+    oss << field.format << ':' << field.name << ':';
+    offset = field.offset + field.size;
+  }
+  if (itemsize > offset)
+    oss << (itemsize - offset) << 'x';
+  oss << '}';
+  auto format_str = oss.str();
 
-    // Sanity check: verify that NumPy properly parses our buffer format string
-    auto& api = npy_api::get();
-    auto arr =  array(buffer_info(nullptr, itemsize, format_str, 1));
-    if (!api.PyArray_EquivTypes_(dtype_ptr, arr.dtype().ptr()))
-        pybind11_fail("NumPy: invalid buffer descriptor!");
+  // Sanity check: verify that NumPy properly parses our buffer format string
+  auto &api = npy_api::get();
+  auto arr = array(buffer_info(nullptr, itemsize, format_str, 1));
+  if (!api.PyArray_EquivTypes_(dtype_ptr, arr.dtype().ptr()))
+    pybind11_fail("NumPy: invalid buffer descriptor!");
 
-    auto tindex = std::type_index(tinfo);
-    numpy_internals.registered_dtypes[tindex] = { dtype_ptr, format_str };
-    get_internals().direct_conversions[tindex].push_back(direct_converter);
+  auto tindex = std::type_index(tinfo);
+  numpy_internals.registered_dtypes[tindex] = {dtype_ptr, format_str};
+  get_internals().direct_conversions[tindex].push_back(direct_converter);
 }
 
 template <typename T, typename SFINAE> struct npy_format_descriptor {
-    static_assert(is_pod_struct<T>::value, "Attempt to use a non-POD or unimplemented POD type as a numpy dtype");
+  static_assert(
+      is_pod_struct<T>::value,
+      "Attempt to use a non-POD or unimplemented POD type as a numpy dtype");
 
-    static constexpr auto name = make_caster<T>::name;
+  static constexpr auto name = make_caster<T>::name;
 
-    static pybind11::dtype dtype() {
-        return reinterpret_borrow<pybind11::dtype>(dtype_ptr());
-    }
+  static pybind11::dtype dtype() {
+    return reinterpret_borrow<pybind11::dtype>(dtype_ptr());
+  }
 
-    static std::string format() {
-        static auto format_str = get_numpy_internals().get_type_info<T>(true)->format_str;
-        return format_str;
-    }
+  static std::string format() {
+    static auto format_str =
+        get_numpy_internals().get_type_info<T>(true)->format_str;
+    return format_str;
+  }
 
-    static void register_dtype(any_container<field_descriptor> fields) {
-        register_structured_dtype(std::move(fields), typeid(typename std::remove_cv<T>::type),
-                                  sizeof(T), &direct_converter);
-    }
+  static void register_dtype(any_container<field_descriptor> fields) {
+    register_structured_dtype(std::move(fields),
+                              typeid(typename std::remove_cv<T>::type),
+                              sizeof(T), &direct_converter);
+  }
 
 private:
-    static PyObject* dtype_ptr() {
-        static PyObject* ptr = get_numpy_internals().get_type_info<T>(true)->dtype_ptr;
-        return ptr;
-    }
+  static PyObject *dtype_ptr() {
+    static PyObject *ptr =
+        get_numpy_internals().get_type_info<T>(true)->dtype_ptr;
+    return ptr;
+  }
 
-    static bool direct_converter(PyObject *obj, void*& value) {
-        auto& api = npy_api::get();
-        if (!PyObject_TypeCheck(obj, api.PyVoidArrType_Type_))
-            return false;
-        if (auto descr = reinterpret_steal<object>(api.PyArray_DescrFromScalar_(obj))) {
-            if (api.PyArray_EquivTypes_(dtype_ptr(), descr.ptr())) {
-                value = ((PyVoidScalarObject_Proxy *) obj)->obval;
-                return true;
-            }
-        }
-        return false;
+  static bool direct_converter(PyObject *obj, void *&value) {
+    auto &api = npy_api::get();
+    if (!PyObject_TypeCheck(obj, api.PyVoidArrType_Type_))
+      return false;
+    if (auto descr =
+            reinterpret_steal<object>(api.PyArray_DescrFromScalar_(obj))) {
+      if (api.PyArray_EquivTypes_(dtype_ptr(), descr.ptr())) {
+        value = ((PyVoidScalarObject_Proxy *)obj)->obval;
+        return true;
+      }
     }
+    return false;
+  }
 };
 
-#ifdef __CLION_IDE__ // replace heavy macro with dummy code for the IDE (doesn't affect code)
-# define PYBIND11_NUMPY_DTYPE(Type, ...) ((void)0)
-# define PYBIND11_NUMPY_DTYPE_EX(Type, ...) ((void)0)
+#ifdef __CLION_IDE__ // replace heavy macro with dummy code for the IDE (doesn't
+                     // affect code)
+#define PYBIND11_NUMPY_DTYPE(Type, ...) ((void)0)
+#define PYBIND11_NUMPY_DTYPE_EX(Type, ...) ((void)0)
 #else
 
-#define PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, Name)                                          \
-    ::pybind11::detail::field_descriptor {                                                    \
-        Name, offsetof(T, Field), sizeof(decltype(std::declval<T>().Field)),                  \
-        ::pybind11::format_descriptor<decltype(std::declval<T>().Field)>::format(),           \
-        ::pybind11::detail::npy_format_descriptor<decltype(std::declval<T>().Field)>::dtype() \
-    }
+#define PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, Name)                           \
+  ::pybind11::detail::field_descriptor {                                       \
+    Name, offsetof(T, Field), sizeof(decltype(std::declval<T>().Field)),       \
+        ::pybind11::format_descriptor<decltype(                                \
+            std::declval<T>().Field)>::format(),                               \
+        ::pybind11::detail::npy_format_descriptor<decltype(                    \
+            std::declval<T>().Field)>::dtype()                                 \
+  }
 
 // Extract name, offset and format descriptor for a struct field
-#define PYBIND11_FIELD_DESCRIPTOR(T, Field) PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, #Field)
+#define PYBIND11_FIELD_DESCRIPTOR(T, Field)                                    \
+  PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, #Field)
 
-// The main idea of this macro is borrowed from https://github.com/swansontec/map-macro
-// (C) William Swanson, Paul Fultz
+// The main idea of this macro is borrowed from
+// https://github.com/swansontec/map-macro -- (C) William Swanson, Paul Fultz
 #define PYBIND11_EVAL0(...) __VA_ARGS__
-#define PYBIND11_EVAL1(...) PYBIND11_EVAL0 (PYBIND11_EVAL0 (PYBIND11_EVAL0 (__VA_ARGS__)))
-#define PYBIND11_EVAL2(...) PYBIND11_EVAL1 (PYBIND11_EVAL1 (PYBIND11_EVAL1 (__VA_ARGS__)))
-#define PYBIND11_EVAL3(...) PYBIND11_EVAL2 (PYBIND11_EVAL2 (PYBIND11_EVAL2 (__VA_ARGS__)))
-#define PYBIND11_EVAL4(...) PYBIND11_EVAL3 (PYBIND11_EVAL3 (PYBIND11_EVAL3 (__VA_ARGS__)))
-#define PYBIND11_EVAL(...)  PYBIND11_EVAL4 (PYBIND11_EVAL4 (PYBIND11_EVAL4 (__VA_ARGS__)))
+#define PYBIND11_EVAL1(...)                                                    \
+  PYBIND11_EVAL0(PYBIND11_EVAL0(PYBIND11_EVAL0(__VA_ARGS__)))
+#define PYBIND11_EVAL2(...)                                                    \
+  PYBIND11_EVAL1(PYBIND11_EVAL1(PYBIND11_EVAL1(__VA_ARGS__)))
+#define PYBIND11_EVAL3(...)                                                    \
+  PYBIND11_EVAL2(PYBIND11_EVAL2(PYBIND11_EVAL2(__VA_ARGS__)))
+#define PYBIND11_EVAL4(...)                                                    \
+  PYBIND11_EVAL3(PYBIND11_EVAL3(PYBIND11_EVAL3(__VA_ARGS__)))
+#define PYBIND11_EVAL(...)                                                     \
+  PYBIND11_EVAL4(PYBIND11_EVAL4(PYBIND11_EVAL4(__VA_ARGS__)))
 #define PYBIND11_MAP_END(...)
 #define PYBIND11_MAP_OUT
 #define PYBIND11_MAP_COMMA ,
 #define PYBIND11_MAP_GET_END() 0, PYBIND11_MAP_END
 #define PYBIND11_MAP_NEXT0(test, next, ...) next PYBIND11_MAP_OUT
-#define PYBIND11_MAP_NEXT1(test, next) PYBIND11_MAP_NEXT0 (test, next, 0)
-#define PYBIND11_MAP_NEXT(test, next)  PYBIND11_MAP_NEXT1 (PYBIND11_MAP_GET_END test, next)
+#define PYBIND11_MAP_NEXT1(test, next) PYBIND11_MAP_NEXT0(test, next, 0)
+#define PYBIND11_MAP_NEXT(test, next)                                          \
+  PYBIND11_MAP_NEXT1(PYBIND11_MAP_GET_END test, next)
 #ifdef _MSC_VER // MSVC is not as eager to expand macros, hence this workaround
-#define PYBIND11_MAP_LIST_NEXT1(test, next) \
-    PYBIND11_EVAL0 (PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0))
+#define PYBIND11_MAP_LIST_NEXT1(test, next)                                    \
+  PYBIND11_EVAL0(PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0))
 #else
-#define PYBIND11_MAP_LIST_NEXT1(test, next) \
-    PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0)
+#define PYBIND11_MAP_LIST_NEXT1(test, next)                                    \
+  PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0)
 #endif
-#define PYBIND11_MAP_LIST_NEXT(test, next) \
-    PYBIND11_MAP_LIST_NEXT1 (PYBIND11_MAP_GET_END test, next)
-#define PYBIND11_MAP_LIST0(f, t, x, peek, ...) \
-    f(t, x) PYBIND11_MAP_LIST_NEXT (peek, PYBIND11_MAP_LIST1) (f, t, peek, __VA_ARGS__)
-#define PYBIND11_MAP_LIST1(f, t, x, peek, ...) \
-    f(t, x) PYBIND11_MAP_LIST_NEXT (peek, PYBIND11_MAP_LIST0) (f, t, peek, __VA_ARGS__)
+#define PYBIND11_MAP_LIST_NEXT(test, next)                                     \
+  PYBIND11_MAP_LIST_NEXT1(PYBIND11_MAP_GET_END test, next)
+#define PYBIND11_MAP_LIST0(f, t, x, peek, ...)                                 \
+  f(t, x) PYBIND11_MAP_LIST_NEXT(peek, PYBIND11_MAP_LIST1)(f, t, peek,         \
+                                                           __VA_ARGS__)
+#define PYBIND11_MAP_LIST1(f, t, x, peek, ...)                                 \
+  f(t, x) PYBIND11_MAP_LIST_NEXT(peek, PYBIND11_MAP_LIST0)(f, t, peek,         \
+                                                           __VA_ARGS__)
 // PYBIND11_MAP_LIST(f, t, a1, a2, ...) expands to f(t, a1), f(t, a2), ...
-#define PYBIND11_MAP_LIST(f, t, ...) \
-    PYBIND11_EVAL (PYBIND11_MAP_LIST1 (f, t, __VA_ARGS__, (), 0))
+#define PYBIND11_MAP_LIST(f, t, ...)                                           \
+  PYBIND11_EVAL(PYBIND11_MAP_LIST1(f, t, __VA_ARGS__, (), 0))
 
-#define PYBIND11_NUMPY_DTYPE(Type, ...) \
-    ::pybind11::detail::npy_format_descriptor<Type>::register_dtype \
-        (::std::vector<::pybind11::detail::field_descriptor> \
-         {PYBIND11_MAP_LIST (PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)})
+#define PYBIND11_NUMPY_DTYPE(Type, ...)                                        \
+  ::pybind11::detail::npy_format_descriptor<Type>::register_dtype(             \
+      ::std::vector<::pybind11::detail::field_descriptor>{                     \
+          PYBIND11_MAP_LIST(PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)})
 
 #ifdef _MSC_VER
-#define PYBIND11_MAP2_LIST_NEXT1(test, next) \
-    PYBIND11_EVAL0 (PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0))
+#define PYBIND11_MAP2_LIST_NEXT1(test, next)                                   \
+  PYBIND11_EVAL0(PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0))
 #else
-#define PYBIND11_MAP2_LIST_NEXT1(test, next) \
-    PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0)
+#define PYBIND11_MAP2_LIST_NEXT1(test, next)                                   \
+  PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0)
 #endif
-#define PYBIND11_MAP2_LIST_NEXT(test, next) \
-    PYBIND11_MAP2_LIST_NEXT1 (PYBIND11_MAP_GET_END test, next)
-#define PYBIND11_MAP2_LIST0(f, t, x1, x2, peek, ...) \
-    f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT (peek, PYBIND11_MAP2_LIST1) (f, t, peek, __VA_ARGS__)
-#define PYBIND11_MAP2_LIST1(f, t, x1, x2, peek, ...) \
-    f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT (peek, PYBIND11_MAP2_LIST0) (f, t, peek, __VA_ARGS__)
-// PYBIND11_MAP2_LIST(f, t, a1, a2, ...) expands to f(t, a1, a2), f(t, a3, a4), ...
-#define PYBIND11_MAP2_LIST(f, t, ...) \
-    PYBIND11_EVAL (PYBIND11_MAP2_LIST1 (f, t, __VA_ARGS__, (), 0))
+#define PYBIND11_MAP2_LIST_NEXT(test, next)                                    \
+  PYBIND11_MAP2_LIST_NEXT1(PYBIND11_MAP_GET_END test, next)
+#define PYBIND11_MAP2_LIST0(f, t, x1, x2, peek, ...)                           \
+  f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT(peek, PYBIND11_MAP2_LIST1)(f, t, peek,  \
+                                                                  __VA_ARGS__)
+#define PYBIND11_MAP2_LIST1(f, t, x1, x2, peek, ...)                           \
+  f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT(peek, PYBIND11_MAP2_LIST0)(f, t, peek,  \
+                                                                  __VA_ARGS__)
+// PYBIND11_MAP2_LIST(f, t, a1, a2, ...) expands to f(t, a1, a2), f(t, a3, a4),
+// ...
+#define PYBIND11_MAP2_LIST(f, t, ...)                                          \
+  PYBIND11_EVAL(PYBIND11_MAP2_LIST1(f, t, __VA_ARGS__, (), 0))
 
-#define PYBIND11_NUMPY_DTYPE_EX(Type, ...) \
-    ::pybind11::detail::npy_format_descriptor<Type>::register_dtype \
-        (::std::vector<::pybind11::detail::field_descriptor> \
-         {PYBIND11_MAP2_LIST (PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)})
+#define PYBIND11_NUMPY_DTYPE_EX(Type, ...)                                     \
+  ::pybind11::detail::npy_format_descriptor<Type>::register_dtype(             \
+      ::std::vector<::pybind11::detail::field_descriptor>{PYBIND11_MAP2_LIST(  \
+          PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)})
 
 #endif // __CLION_IDE__
 
-template  <class T>
-using array_iterator = typename std::add_pointer<T>::type;
+template <class T> using array_iterator = typename std::add_pointer<T>::type;
 
-template <class T>
-array_iterator<T> array_begin(const buffer_info& buffer) {
-    return array_iterator<T>(reinterpret_cast<T*>(buffer.ptr));
+template <class T> array_iterator<T> array_begin(const buffer_info &buffer) {
+  return array_iterator<T>(reinterpret_cast<T *>(buffer.ptr));
 }
 
-template <class T>
-array_iterator<T> array_end(const buffer_info& buffer) {
-    return array_iterator<T>(reinterpret_cast<T*>(buffer.ptr) + buffer.size);
+template <class T> array_iterator<T> array_end(const buffer_info &buffer) {
+  return array_iterator<T>(reinterpret_cast<T *>(buffer.ptr) + buffer.size);
 }
 
 class common_iterator {
 public:
-    using container_type = std::vector<ssize_t>;
-    using value_type = container_type::value_type;
-    using size_type = container_type::size_type;
+  using container_type = std::vector<ssize_t>;
+  using value_type = container_type::value_type;
+  using size_type = container_type::size_type;
 
-    common_iterator() : p_ptr(0), m_strides() {}
+  common_iterator() : p_ptr(0), m_strides() {}
 
-    common_iterator(void* ptr, const container_type& strides, const container_type& shape)
-        : p_ptr(reinterpret_cast<char*>(ptr)), m_strides(strides.size()) {
-        m_strides.back() = static_cast<value_type>(strides.back());
-        for (size_type i = m_strides.size() - 1; i != 0; --i) {
-            size_type j = i - 1;
-            value_type s = static_cast<value_type>(shape[i]);
-            m_strides[j] = strides[j] + m_strides[i] - strides[i] * s;
-        }
+  common_iterator(void *ptr, const container_type &strides,
+                  const container_type &shape)
+      : p_ptr(reinterpret_cast<char *>(ptr)), m_strides(strides.size()) {
+    m_strides.back() = static_cast<value_type>(strides.back());
+    for (size_type i = m_strides.size() - 1; i != 0; --i) {
+      size_type j = i - 1;
+      value_type s = static_cast<value_type>(shape[i]);
+      m_strides[j] = strides[j] + m_strides[i] - strides[i] * s;
     }
+  }
 
-    void increment(size_type dim) {
-        p_ptr += m_strides[dim];
-    }
+  void increment(size_type dim) { p_ptr += m_strides[dim]; }
 
-    void* data() const {
-        return p_ptr;
-    }
+  void *data() const { return p_ptr; }
 
 private:
-    char* p_ptr;
-    container_type m_strides;
+  char *p_ptr;
+  container_type m_strides;
 };
 
 template <size_t N> class multi_array_iterator {
 public:
-    using container_type = std::vector<ssize_t>;
+  using container_type = std::vector<ssize_t>;
 
-    multi_array_iterator(const std::array<buffer_info, N> &buffers,
-                         const container_type &shape)
-        : m_shape(shape.size()), m_index(shape.size(), 0),
-          m_common_iterator() {
+  multi_array_iterator(const std::array<buffer_info, N> &buffers,
+                       const container_type &shape)
+      : m_shape(shape.size()), m_index(shape.size(), 0), m_common_iterator() {
 
-        // Manual copy to avoid conversion warning if using std::copy
-        for (size_t i = 0; i < shape.size(); ++i)
-            m_shape[i] = shape[i];
+    // Manual copy to avoid conversion warning if using std::copy
+    for (size_t i = 0; i < shape.size(); ++i)
+      m_shape[i] = shape[i];
 
-        container_type strides(shape.size());
-        for (size_t i = 0; i < N; ++i)
-            init_common_iterator(buffers[i], shape, m_common_iterator[i], strides);
+    container_type strides(shape.size());
+    for (size_t i = 0; i < N; ++i)
+      init_common_iterator(buffers[i], shape, m_common_iterator[i], strides);
+  }
+
+  multi_array_iterator &operator++() {
+    for (size_t j = m_index.size(); j != 0; --j) {
+      size_t i = j - 1;
+      if (++m_index[i] != m_shape[i]) {
+        increment_common_iterator(i);
+        break;
+      } else {
+        m_index[i] = 0;
+      }
     }
+    return *this;
+  }
 
-    multi_array_iterator& operator++() {
-        for (size_t j = m_index.size(); j != 0; --j) {
-            size_t i = j - 1;
-            if (++m_index[i] != m_shape[i]) {
-                increment_common_iterator(i);
-                break;
-            } else {
-                m_index[i] = 0;
-            }
-        }
-        return *this;
-    }
-
-    template <size_t K, class T = void> T* data() const {
-        return reinterpret_cast<T*>(m_common_iterator[K].data());
-    }
+  template <size_t K, class T = void> T *data() const {
+    return reinterpret_cast<T *>(m_common_iterator[K].data());
+  }
 
 private:
+  using common_iter = common_iterator;
 
-    using common_iter = common_iterator;
+  void init_common_iterator(const buffer_info &buffer,
+                            const container_type &shape, common_iter &iterator,
+                            container_type &strides) {
+    auto buffer_shape_iter = buffer.shape.rbegin();
+    auto buffer_strides_iter = buffer.strides.rbegin();
+    auto shape_iter = shape.rbegin();
+    auto strides_iter = strides.rbegin();
 
-    void init_common_iterator(const buffer_info &buffer,
-                              const container_type &shape,
-                              common_iter &iterator,
-                              container_type &strides) {
-        auto buffer_shape_iter = buffer.shape.rbegin();
-        auto buffer_strides_iter = buffer.strides.rbegin();
-        auto shape_iter = shape.rbegin();
-        auto strides_iter = strides.rbegin();
+    while (buffer_shape_iter != buffer.shape.rend()) {
+      if (*shape_iter == *buffer_shape_iter)
+        *strides_iter = *buffer_strides_iter;
+      else
+        *strides_iter = 0;
 
-        while (buffer_shape_iter != buffer.shape.rend()) {
-            if (*shape_iter == *buffer_shape_iter)
-                *strides_iter = *buffer_strides_iter;
-            else
-                *strides_iter = 0;
-
-            ++buffer_shape_iter;
-            ++buffer_strides_iter;
-            ++shape_iter;
-            ++strides_iter;
-        }
-
-        std::fill(strides_iter, strides.rend(), 0);
-        iterator = common_iter(buffer.ptr, strides, shape);
+      ++buffer_shape_iter;
+      ++buffer_strides_iter;
+      ++shape_iter;
+      ++strides_iter;
     }
 
-    void increment_common_iterator(size_t dim) {
-        for (auto &iter : m_common_iterator)
-            iter.increment(dim);
-    }
+    std::fill(strides_iter, strides.rend(), 0);
+    iterator = common_iter(buffer.ptr, strides, shape);
+  }
 
-    container_type m_shape;
-    container_type m_index;
-    std::array<common_iter, N> m_common_iterator;
+  void increment_common_iterator(size_t dim) {
+    for (auto &iter : m_common_iterator)
+      iter.increment(dim);
+  }
+
+  container_type m_shape;
+  container_type m_index;
+  std::array<common_iter, N> m_common_iterator;
 };
 
 enum class broadcast_trivial { non_trivial, c_trivial, f_trivial };
 
-// Populates the shape and number of dimensions for the set of buffers.  Returns a broadcast_trivial
-// enum value indicating whether the broadcast is "trivial"--that is, has each buffer being either a
-// singleton or a full-size, C-contiguous (`c_trivial`) or Fortran-contiguous (`f_trivial`) storage
+// Populates the shape and number of dimensions for the set of buffers.  Returns
+// a broadcast_trivial enum value indicating whether the broadcast is
+// "trivial"--that is, has each buffer being either a singleton or a full-size,
+// C-contiguous (`c_trivial`) or Fortran-contiguous (`f_trivial`) storage
 // buffer; returns `non_trivial` otherwise.
 template <size_t N>
-broadcast_trivial broadcast(const std::array<buffer_info, N> &buffers, ssize_t &ndim, std::vector<ssize_t> &shape) {
-    ndim = std::accumulate(buffers.begin(), buffers.end(), ssize_t(0), [](ssize_t res, const buffer_info &buf) {
-        return std::max(res, buf.ndim);
-    });
+broadcast_trivial broadcast(const std::array<buffer_info, N> &buffers,
+                            ssize_t &ndim, std::vector<ssize_t> &shape) {
+  ndim = std::accumulate(buffers.begin(), buffers.end(), ssize_t(0),
+                         [](ssize_t res, const buffer_info &buf) {
+                           return std::max(res, buf.ndim);
+                         });
 
-    shape.clear();
-    shape.resize((size_t) ndim, 1);
+  shape.clear();
+  shape.resize((size_t)ndim, 1);
 
-    // Figure out the output size, and make sure all input arrays conform (i.e. are either size 1 or
-    // the full size).
-    for (size_t i = 0; i < N; ++i) {
-        auto res_iter = shape.rbegin();
-        auto end = buffers[i].shape.rend();
-        for (auto shape_iter = buffers[i].shape.rbegin(); shape_iter != end; ++shape_iter, ++res_iter) {
-            const auto &dim_size_in = *shape_iter;
-            auto &dim_size_out = *res_iter;
+  // Figure out the output size, and make sure all input arrays conform (i.e.
+  // are either size 1 or the full size).
+  for (size_t i = 0; i < N; ++i) {
+    auto res_iter = shape.rbegin();
+    auto end = buffers[i].shape.rend();
+    for (auto shape_iter = buffers[i].shape.rbegin(); shape_iter != end;
+         ++shape_iter, ++res_iter) {
+      const auto &dim_size_in = *shape_iter;
+      auto &dim_size_out = *res_iter;
 
-            // Each input dimension can either be 1 or `n`, but `n` values must match across buffers
-            if (dim_size_out == 1)
-                dim_size_out = dim_size_in;
-            else if (dim_size_in != 1 && dim_size_in != dim_size_out)
-                pybind11_fail("pybind11::vectorize: incompatible size/dimension of inputs!");
-        }
+      // Each input dimension can either be 1 or `n`, but `n` values must match
+      // across buffers
+      if (dim_size_out == 1)
+        dim_size_out = dim_size_in;
+      else if (dim_size_in != 1 && dim_size_in != dim_size_out)
+        pybind11_fail(
+            "pybind11::vectorize: incompatible size/dimension of inputs!");
+    }
+  }
+
+  bool trivial_broadcast_c = true;
+  bool trivial_broadcast_f = true;
+  for (size_t i = 0; i < N && (trivial_broadcast_c || trivial_broadcast_f);
+       ++i) {
+    if (buffers[i].size == 1)
+      continue;
+
+    // Require the same number of dimensions:
+    if (buffers[i].ndim != ndim)
+      return broadcast_trivial::non_trivial;
+
+    // Require all dimensions be full-size:
+    if (!std::equal(buffers[i].shape.cbegin(), buffers[i].shape.cend(),
+                    shape.cbegin()))
+      return broadcast_trivial::non_trivial;
+
+    // Check for C contiguity (but only if previous inputs were also C
+    // contiguous)
+    if (trivial_broadcast_c) {
+      ssize_t expect_stride = buffers[i].itemsize;
+      auto end = buffers[i].shape.crend();
+      for (auto shape_iter = buffers[i].shape.crbegin(),
+                stride_iter = buffers[i].strides.crbegin();
+           trivial_broadcast_c && shape_iter != end;
+           ++shape_iter, ++stride_iter) {
+        if (expect_stride == *stride_iter)
+          expect_stride *= *shape_iter;
+        else
+          trivial_broadcast_c = false;
+      }
     }
 
-    bool trivial_broadcast_c = true;
-    bool trivial_broadcast_f = true;
-    for (size_t i = 0; i < N && (trivial_broadcast_c || trivial_broadcast_f); ++i) {
-        if (buffers[i].size == 1)
-            continue;
-
-        // Require the same number of dimensions:
-        if (buffers[i].ndim != ndim)
-            return broadcast_trivial::non_trivial;
-
-        // Require all dimensions be full-size:
-        if (!std::equal(buffers[i].shape.cbegin(), buffers[i].shape.cend(), shape.cbegin()))
-            return broadcast_trivial::non_trivial;
-
-        // Check for C contiguity (but only if previous inputs were also C contiguous)
-        if (trivial_broadcast_c) {
-            ssize_t expect_stride = buffers[i].itemsize;
-            auto end = buffers[i].shape.crend();
-            for (auto shape_iter = buffers[i].shape.crbegin(), stride_iter = buffers[i].strides.crbegin();
-                    trivial_broadcast_c && shape_iter != end; ++shape_iter, ++stride_iter) {
-                if (expect_stride == *stride_iter)
-                    expect_stride *= *shape_iter;
-                else
-                    trivial_broadcast_c = false;
-            }
-        }
-
-        // Check for Fortran contiguity (if previous inputs were also F contiguous)
-        if (trivial_broadcast_f) {
-            ssize_t expect_stride = buffers[i].itemsize;
-            auto end = buffers[i].shape.cend();
-            for (auto shape_iter = buffers[i].shape.cbegin(), stride_iter = buffers[i].strides.cbegin();
-                    trivial_broadcast_f && shape_iter != end; ++shape_iter, ++stride_iter) {
-                if (expect_stride == *stride_iter)
-                    expect_stride *= *shape_iter;
-                else
-                    trivial_broadcast_f = false;
-            }
-        }
+    // Check for Fortran contiguity (if previous inputs were also F contiguous)
+    if (trivial_broadcast_f) {
+      ssize_t expect_stride = buffers[i].itemsize;
+      auto end = buffers[i].shape.cend();
+      for (auto shape_iter = buffers[i].shape.cbegin(),
+                stride_iter = buffers[i].strides.cbegin();
+           trivial_broadcast_f && shape_iter != end;
+           ++shape_iter, ++stride_iter) {
+        if (expect_stride == *stride_iter)
+          expect_stride *= *shape_iter;
+        else
+          trivial_broadcast_f = false;
+      }
     }
+  }
 
-    return
-        trivial_broadcast_c ? broadcast_trivial::c_trivial :
-        trivial_broadcast_f ? broadcast_trivial::f_trivial :
-        broadcast_trivial::non_trivial;
+  return trivial_broadcast_c
+             ? broadcast_trivial::c_trivial
+             : trivial_broadcast_f ? broadcast_trivial::f_trivial
+                                   : broadcast_trivial::non_trivial;
 }
 
-template <typename T>
-struct vectorize_arg {
-    static_assert(!std::is_rvalue_reference<T>::value, "Functions with rvalue reference arguments cannot be vectorized");
-    // The wrapped function gets called with this type:
-    using call_type = remove_reference_t<T>;
-    // Is this a vectorized argument?
-    static constexpr bool vectorize =
-        satisfies_any_of<call_type, std::is_arithmetic, is_complex, std::is_pod>::value &&
-        satisfies_none_of<call_type, std::is_pointer, std::is_array, is_std_array, std::is_enum>::value &&
-        (!std::is_reference<T>::value ||
-         (std::is_lvalue_reference<T>::value && std::is_const<call_type>::value));
-    // Accept this type: an array for vectorized types, otherwise the type as-is:
-    using type = conditional_t<vectorize, array_t<remove_cv_t<call_type>, array::forcecast>, T>;
+template <typename T> struct vectorize_arg {
+  static_assert(
+      !std::is_rvalue_reference<T>::value,
+      "Functions with rvalue reference arguments cannot be vectorized");
+  // The wrapped function gets called with this type:
+  using call_type = remove_reference_t<T>;
+  // Is this a vectorized argument?
+  static constexpr bool vectorize =
+      satisfies_any_of<call_type, std::is_arithmetic, is_complex,
+                       std::is_pod>::value &&
+      satisfies_none_of<call_type, std::is_pointer, std::is_array, is_std_array,
+                        std::is_enum>::value &&
+      (!std::is_reference<T>::value ||
+       (std::is_lvalue_reference<T>::value && std::is_const<call_type>::value));
+  // Accept this type: an array for vectorized types, otherwise the type as-is:
+  using type =
+      conditional_t<vectorize,
+                    array_t<remove_cv_t<call_type>, array::forcecast>, T>;
 };
 
 template <typename Func, typename Return, typename... Args>
 struct vectorize_helper {
 private:
-    static constexpr size_t N = sizeof...(Args);
-    static constexpr size_t NVectorized = constexpr_sum(vectorize_arg<Args>::vectorize...);
-    static_assert(NVectorized >= 1,
-            "pybind11::vectorize(...) requires a function with at least one vectorizable argument");
+  static constexpr size_t N = sizeof...(Args);
+  static constexpr size_t NVectorized =
+      constexpr_sum(vectorize_arg<Args>::vectorize...);
+  static_assert(NVectorized >= 1,
+                "pybind11::vectorize(...) requires a function with at least "
+                "one vectorizable argument");
 
 public:
-    template <typename T>
-    explicit vectorize_helper(T &&f) : f(std::forward<T>(f)) { }
+  template <typename T>
+  explicit vectorize_helper(T &&f) : f(std::forward<T>(f)) {}
 
-    object operator()(typename vectorize_arg<Args>::type... args) {
-        return run(args...,
-                   make_index_sequence<N>(),
-                   select_indices<vectorize_arg<Args>::vectorize...>(),
-                   make_index_sequence<NVectorized>());
-    }
+  object operator()(typename vectorize_arg<Args>::type... args) {
+    return run(args..., make_index_sequence<N>(),
+               select_indices<vectorize_arg<Args>::vectorize...>(),
+               make_index_sequence<NVectorized>());
+  }
 
 private:
-    remove_reference_t<Func> f;
+  remove_reference_t<Func> f;
 
-    // Internal compiler error in MSVC 19.16.27025.1 (Visual Studio 2017 15.9.4), when compiling with "/permissive-" flag
-    // when arg_call_types is manually inlined.
-    using arg_call_types = std::tuple<typename vectorize_arg<Args>::call_type...>;
-    template <size_t Index> using param_n_t = typename std::tuple_element<Index, arg_call_types>::type;
+  // Internal compiler error in MSVC 19.16.27025.1 (Visual Studio 2017 15.9.4),
+  // when compiling with "/permissive-" flag when arg_call_types is manually
+  // inlined.
+  using arg_call_types = std::tuple<typename vectorize_arg<Args>::call_type...>;
+  template <size_t Index>
+  using param_n_t = typename std::tuple_element<Index, arg_call_types>::type;
 
-    // Runs a vectorized function given arguments tuple and three index sequences:
-    //     - Index is the full set of 0 ... (N-1) argument indices;
-    //     - VIndex is the subset of argument indices with vectorized parameters, letting us access
-    //       vectorized arguments (anything not in this sequence is passed through)
-    //     - BIndex is a incremental sequence (beginning at 0) of the same size as VIndex, so that
-    //       we can store vectorized buffer_infos in an array (argument VIndex has its buffer at
-    //       index BIndex in the array).
-    template <size_t... Index, size_t... VIndex, size_t... BIndex> object run(
-            typename vectorize_arg<Args>::type &...args,
-            index_sequence<Index...> i_seq, index_sequence<VIndex...> vi_seq, index_sequence<BIndex...> bi_seq) {
+  // Runs a vectorized function given an arguments tuple and three index
+  // sequences:
+  //     - Index is the full set of 0 ... (N-1) argument indices;
+  //     - VIndex is the subset of argument indices with vectorized
+  //       parameters, letting us access vectorized arguments (anything
+  //       not in this sequence is passed through);
+  //     - BIndex is an incremental sequence (beginning at 0) of the same
+  //       size as VIndex, so that we can store vectorized buffer_infos
+  //       in an array (argument VIndex has its buffer at index BIndex
+  //       in the array).
+  template <size_t... Index, size_t... VIndex, size_t... BIndex>
+  object run(typename vectorize_arg<Args>::type &... args,
+             index_sequence<Index...> i_seq, index_sequence<VIndex...> vi_seq,
+             index_sequence<BIndex...> bi_seq) {
 
-        // Pointers to values the function was called with; the vectorized ones set here will start
-        // out as array_t<T> pointers, but they will be changed them to T pointers before we make
-        // call the wrapped function.  Non-vectorized pointers are left as-is.
-        std::array<void *, N> params{{ &args... }};
+    // Pointers to values the function was called with; the vectorized ones set
+    // here will start out as array_t<T> pointers, but they will be changed to T
+    // pointers before we call the wrapped function.  Non-vectorized pointers
+    // are left as-is.
+    std::array<void *, N> params{{&args...}};
 
-        // The array of `buffer_info`s of vectorized arguments:
-        std::array<buffer_info, NVectorized> buffers{{ reinterpret_cast<array *>(params[VIndex])->request()... }};
+    // The array of `buffer_info`s of vectorized arguments:
+    std::array<buffer_info, NVectorized> buffers{
+        {reinterpret_cast<array *>(params[VIndex])->request()...}};
 
-        /* Determine dimensions parameters of output array */
-        ssize_t nd = 0;
-        std::vector<ssize_t> shape(0);
-        auto trivial = broadcast(buffers, nd, shape);
-        size_t ndim = (size_t) nd;
+    /* Determine dimensions parameters of output array */
+    ssize_t nd = 0;
+    std::vector<ssize_t> shape(0);
+    auto trivial = broadcast(buffers, nd, shape);
+    size_t ndim = (size_t)nd;
 
-        size_t size = std::accumulate(shape.begin(), shape.end(), (size_t) 1, std::multiplies<size_t>());
+    size_t size = std::accumulate(shape.begin(), shape.end(), (size_t)1,
+                                  std::multiplies<size_t>());
 
-        // If all arguments are 0-dimension arrays (i.e. single values) return a plain value (i.e.
-        // not wrapped in an array).
-        if (size == 1 && ndim == 0) {
-            PYBIND11_EXPAND_SIDE_EFFECTS(params[VIndex] = buffers[BIndex].ptr);
-            return cast(f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...));
-        }
-
-        array_t<Return> result;
-        if (trivial == broadcast_trivial::f_trivial) result = array_t<Return, array::f_style>(shape);
-        else result = array_t<Return>(shape);
-
-        if (size == 0) return std::move(result);
-
-        /* Call the function */
-        if (trivial == broadcast_trivial::non_trivial)
-            apply_broadcast(buffers, params, result, i_seq, vi_seq, bi_seq);
-        else
-            apply_trivial(buffers, params, result.mutable_data(), size, i_seq, vi_seq, bi_seq);
-
-        return std::move(result);
+    // If all arguments are 0-dimension arrays (i.e. single values) return a
+    // plain value (i.e. not wrapped in an array).
+    if (size == 1 && ndim == 0) {
+      PYBIND11_EXPAND_SIDE_EFFECTS(params[VIndex] = buffers[BIndex].ptr);
+      return cast(f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...));
     }
 
-    template <size_t... Index, size_t... VIndex, size_t... BIndex>
-    void apply_trivial(std::array<buffer_info, NVectorized> &buffers,
+    array_t<Return> result;
+    if (trivial == broadcast_trivial::f_trivial)
+      result = array_t<Return, array::f_style>(shape);
+    else
+      result = array_t<Return>(shape);
+
+    if (size == 0)
+      return std::move(result);
+
+    /* Call the function */
+    if (trivial == broadcast_trivial::non_trivial)
+      apply_broadcast(buffers, params, result, i_seq, vi_seq, bi_seq);
+    else
+      apply_trivial(buffers, params, result.mutable_data(), size, i_seq, vi_seq,
+                    bi_seq);
+
+    return std::move(result);
+  }
+
+  template <size_t... Index, size_t... VIndex, size_t... BIndex>
+  void apply_trivial(std::array<buffer_info, NVectorized> &buffers,
+                     std::array<void *, N> &params, Return *out, size_t size,
+                     index_sequence<Index...>, index_sequence<VIndex...>,
+                     index_sequence<BIndex...>) {
+
+    // Initialize an array of mutable byte references and sizes with references
+    // set to the appropriate pointer in `params`; as we iterate, we'll
+    // increment each pointer by its size (except for singletons, which get an
+    // increment of 0).
+    std::array<std::pair<unsigned char *&, const size_t>, NVectorized>
+        vecparams{{std::pair<unsigned char *&, const size_t>(
+            reinterpret_cast<unsigned char *&>(params[VIndex] =
+                                                   buffers[BIndex].ptr),
+            buffers[BIndex].size == 1 ? 0 : sizeof(param_n_t<VIndex>))...}};
+
+    for (size_t i = 0; i < size; ++i) {
+      out[i] = f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...);
+      for (auto &x : vecparams)
+        x.first += x.second;
+    }
+  }
+
+  template <size_t... Index, size_t... VIndex, size_t... BIndex>
+  void apply_broadcast(std::array<buffer_info, NVectorized> &buffers,
                        std::array<void *, N> &params,
-                       Return *out,
-                       size_t size,
-                       index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
+                       array_t<Return> &output_array, index_sequence<Index...>,
+                       index_sequence<VIndex...>, index_sequence<BIndex...>) {
 
-        // Initialize an array of mutable byte references and sizes with references set to the
-        // appropriate pointer in `params`; as we iterate, we'll increment each pointer by its size
-        // (except for singletons, which get an increment of 0).
-        std::array<std::pair<unsigned char *&, const size_t>, NVectorized> vecparams{{
-            std::pair<unsigned char *&, const size_t>(
-                    reinterpret_cast<unsigned char *&>(params[VIndex] = buffers[BIndex].ptr),
-                    buffers[BIndex].size == 1 ? 0 : sizeof(param_n_t<VIndex>)
-            )...
-        }};
+    buffer_info output = output_array.request();
+    multi_array_iterator<NVectorized> input_iter(buffers, output.shape);
 
-        for (size_t i = 0; i < size; ++i) {
-            out[i] = f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...);
-            for (auto &x : vecparams) x.first += x.second;
-        }
-    }
-
-    template <size_t... Index, size_t... VIndex, size_t... BIndex>
-    void apply_broadcast(std::array<buffer_info, NVectorized> &buffers,
-                         std::array<void *, N> &params,
-                         array_t<Return> &output_array,
-                         index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
-
-        buffer_info output = output_array.request();
-        multi_array_iterator<NVectorized> input_iter(buffers, output.shape);
-
-        for (array_iterator<Return> iter = array_begin<Return>(output), end = array_end<Return>(output);
-             iter != end;
-             ++iter, ++input_iter) {
-            PYBIND11_EXPAND_SIDE_EFFECTS((
-                params[VIndex] = input_iter.template data<BIndex>()
-            ));
-            *iter = f(*reinterpret_cast<param_n_t<Index> *>(std::get<Index>(params))...);
-        }
+    for (array_iterator<Return> iter = array_begin<Return>(output),
+                                end = array_end<Return>(output);
+         iter != end; ++iter, ++input_iter) {
+      PYBIND11_EXPAND_SIDE_EFFECTS(
+          (params[VIndex] = input_iter.template data<BIndex>()));
+      *iter =
+          f(*reinterpret_cast<param_n_t<Index> *>(std::get<Index>(params))...);
     }
+  }
 };
 
 template <typename Func, typename Return, typename... Args>
 vectorize_helper<Func, Return, Args...>
-vectorize_extractor(const Func &f, Return (*) (Args ...)) {
-    return detail::vectorize_helper<Func, Return, Args...>(f);
+vectorize_extractor(const Func &f, Return (*)(Args...)) {
+  return detail::vectorize_helper<Func, Return, Args...>(f);
 }
 
 template <typename T, int Flags> struct handle_type_name<array_t<T, Flags>> {
-    static constexpr auto name = _("numpy.ndarray[") + npy_format_descriptor<T>::name + _("]");
+  static constexpr auto name =
+      _("numpy.ndarray[") + npy_format_descriptor<T>::name + _("]");
 };
 
 NAMESPACE_END(detail)
@@ -1578,29 +1751,36 @@ NAMESPACE_END(detail)
 // Vanilla pointer vectorizer:
 template <typename Return, typename... Args>
 detail::vectorize_helper<Return (*)(Args...), Return, Args...>
-vectorize(Return (*f) (Args ...)) {
-    return detail::vectorize_helper<Return (*)(Args...), Return, Args...>(f);
+vectorize(Return (*f)(Args...)) {
+  return detail::vectorize_helper<Return (*)(Args...), Return, Args...>(f);
 }
 
 // lambda vectorizer:
-template <typename Func, detail::enable_if_t<detail::is_lambda<Func>::value, int> = 0>
-auto vectorize(Func &&f) -> decltype(
-        detail::vectorize_extractor(std::forward<Func>(f), (detail::function_signature_t<Func> *) nullptr)) {
-    return detail::vectorize_extractor(std::forward<Func>(f), (detail::function_signature_t<Func> *) nullptr);
+template <typename Func,
+          detail::enable_if_t<detail::is_lambda<Func>::value, int> = 0>
+auto vectorize(Func &&f) -> decltype(detail::vectorize_extractor(
+    std::forward<Func>(f), (detail::function_signature_t<Func> *)nullptr)) {
+  return detail::vectorize_extractor(
+      std::forward<Func>(f), (detail::function_signature_t<Func> *)nullptr);
 }
 
 // Vectorize a class method (non-const):
 template <typename Return, typename Class, typename... Args,
-          typename Helper = detail::vectorize_helper<decltype(std::mem_fn(std::declval<Return (Class::*)(Args...)>())), Return, Class *, Args...>>
+          typename Helper = detail::vectorize_helper<
+              decltype(std::mem_fn(std::declval<Return (Class::*)(Args...)>())),
+              Return, Class *, Args...>>
 Helper vectorize(Return (Class::*f)(Args...)) {
-    return Helper(std::mem_fn(f));
+  return Helper(std::mem_fn(f));
 }
 
 // Vectorize a class method (const):
-template <typename Return, typename Class, typename... Args,
-          typename Helper = detail::vectorize_helper<decltype(std::mem_fn(std::declval<Return (Class::*)(Args...) const>())), Return, const Class *, Args...>>
+template <
+    typename Return, typename Class, typename... Args,
+    typename Helper = detail::vectorize_helper<
+        decltype(std::mem_fn(std::declval<Return (Class::*)(Args...) const>())),
+        Return, const Class *, Args...>>
 Helper vectorize(Return (Class::*f)(Args...) const) {
-    return Helper(std::mem_fn(f));
+  return Helper(std::mem_fn(f));
 }
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/operators.h b/python/src/pybind11/operators.h
index b3dd62c3b..1022048c3 100644
--- a/python/src/pybind11/operators.h
+++ b/python/src/pybind11/operators.h
@@ -12,10 +12,13 @@
 #include "pybind11.h"
 
 #if defined(__clang__) && !defined(__INTEL_COMPILER)
-#  pragma clang diagnostic ignored "-Wunsequenced" // multiple unsequenced modifications to 'self' (when using def(py::self OP Type()))
+#pragma clang diagnostic ignored                                               \
+    "-Wunsequenced" // multiple unsequenced modifications to 'self' (when using
+                    // def(py::self OP Type()))
 #elif defined(_MSC_VER)
-#  pragma warning(push)
-#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#pragma warning(push)
+#pragma warning(                                                               \
+    disable : 4127) // warning C4127: Conditional expression is constant
 #endif
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
@@ -23,136 +26,191 @@ NAMESPACE_BEGIN(detail)
 
 /// Enumeration with all supported operator types
 enum op_id : int {
-    op_add, op_sub, op_mul, op_div, op_mod, op_divmod, op_pow, op_lshift,
-    op_rshift, op_and, op_xor, op_or, op_neg, op_pos, op_abs, op_invert,
-    op_int, op_long, op_float, op_str, op_cmp, op_gt, op_ge, op_lt, op_le,
-    op_eq, op_ne, op_iadd, op_isub, op_imul, op_idiv, op_imod, op_ilshift,
-    op_irshift, op_iand, op_ixor, op_ior, op_complex, op_bool, op_nonzero,
-    op_repr, op_truediv, op_itruediv, op_hash
+  op_add,
+  op_sub,
+  op_mul,
+  op_div,
+  op_mod,
+  op_divmod,
+  op_pow,
+  op_lshift,
+  op_rshift,
+  op_and,
+  op_xor,
+  op_or,
+  op_neg,
+  op_pos,
+  op_abs,
+  op_invert,
+  op_int,
+  op_long,
+  op_float,
+  op_str,
+  op_cmp,
+  op_gt,
+  op_ge,
+  op_lt,
+  op_le,
+  op_eq,
+  op_ne,
+  op_iadd,
+  op_isub,
+  op_imul,
+  op_idiv,
+  op_imod,
+  op_ilshift,
+  op_irshift,
+  op_iand,
+  op_ixor,
+  op_ior,
+  op_complex,
+  op_bool,
+  op_nonzero,
+  op_repr,
+  op_truediv,
+  op_itruediv,
+  op_hash
 };
 
 enum op_type : int {
-    op_l, /* base type on left */
-    op_r, /* base type on right */
-    op_u  /* unary operator */
+  op_l, /* base type on left */
+  op_r, /* base type on right */
+  op_u  /* unary operator */
 };
 
-struct self_t { };
+struct self_t {};
 static const self_t self = self_t();
 
 /// Type for an unused type slot
-struct undefined_t { };
+struct undefined_t {};
 
 /// Don't warn about an unused variable
 inline self_t __self() { return self; }
 
 /// base template of operator implementations
-template <op_id, op_type, typename B, typename L, typename R> struct op_impl { };
+template <op_id, op_type, typename B, typename L, typename R> struct op_impl {};
 
 /// Operator implementation generator
 template <op_id id, op_type ot, typename L, typename R> struct op_ {
-    template <typename Class, typename... Extra> void execute(Class &cl, const Extra&... extra) const {
-        using Base = typename Class::type;
-        using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
-        using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
-        using op = op_impl<id, ot, Base, L_type, R_type>;
-        cl.def(op::name(), &op::execute, is_operator(), extra...);
-        #if PY_MAJOR_VERSION < 3
-        if (id == op_truediv || id == op_itruediv)
-            cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__",
-                    &op::execute, is_operator(), extra...);
-        #endif
-    }
-    template <typename Class, typename... Extra> void execute_cast(Class &cl, const Extra&... extra) const {
-        using Base = typename Class::type;
-        using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
-        using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
-        using op = op_impl<id, ot, Base, L_type, R_type>;
-        cl.def(op::name(), &op::execute_cast, is_operator(), extra...);
-        #if PY_MAJOR_VERSION < 3
-        if (id == op_truediv || id == op_itruediv)
-            cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__",
-                    &op::execute, is_operator(), extra...);
-        #endif
-    }
+  template <typename Class, typename... Extra>
+  void execute(Class &cl, const Extra &... extra) const {
+    using Base = typename Class::type;
+    using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
+    using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
+    using op = op_impl<id, ot, Base, L_type, R_type>;
+    cl.def(op::name(), &op::execute, is_operator(), extra...);
+#if PY_MAJOR_VERSION < 3
+    if (id == op_truediv || id == op_itruediv)
+      cl.def(id == op_itruediv ? "__idiv__"
+                               : ot == op_l ? "__div__" : "__rdiv__",
+             &op::execute, is_operator(), extra...);
+#endif
+  }
+  template <typename Class, typename... Extra>
+  void execute_cast(Class &cl, const Extra &... extra) const {
+    using Base = typename Class::type;
+    using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
+    using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
+    using op = op_impl<id, ot, Base, L_type, R_type>;
+    cl.def(op::name(), &op::execute_cast, is_operator(), extra...);
+#if PY_MAJOR_VERSION < 3
+    if (id == op_truediv || id == op_itruediv)
+      cl.def(id == op_itruediv ? "__idiv__"
+                               : ot == op_l ? "__div__" : "__rdiv__",
+             &op::execute, is_operator(), extra...);
+#endif
+  }
 };
 
-#define PYBIND11_BINARY_OPERATOR(id, rid, op, expr)                                    \
-template <typename B, typename L, typename R> struct op_impl<op_##id, op_l, B, L, R> { \
-    static char const* name() { return "__" #id "__"; }                                \
-    static auto execute(const L &l, const R &r) -> decltype(expr) { return (expr); }   \
-    static B execute_cast(const L &l, const R &r) { return B(expr); }                  \
-};                                                                                     \
-template <typename B, typename L, typename R> struct op_impl<op_##id, op_r, B, L, R> { \
-    static char const* name() { return "__" #rid "__"; }                               \
-    static auto execute(const R &r, const L &l) -> decltype(expr) { return (expr); }   \
-    static B execute_cast(const R &r, const L &l) { return B(expr); }                  \
-};                                                                                     \
-inline op_<op_##id, op_l, self_t, self_t> op(const self_t &, const self_t &) {         \
-    return op_<op_##id, op_l, self_t, self_t>();                                       \
-}                                                                                      \
-template <typename T> op_<op_##id, op_l, self_t, T> op(const self_t &, const T &) {    \
-    return op_<op_##id, op_l, self_t, T>();                                            \
-}                                                                                      \
-template <typename T> op_<op_##id, op_r, T, self_t> op(const T &, const self_t &) {    \
-    return op_<op_##id, op_r, T, self_t>();                                            \
-}
+#define PYBIND11_BINARY_OPERATOR(id, rid, op, expr)                            \
+  template <typename B, typename L, typename R>                                \
+  struct op_impl<op_##id, op_l, B, L, R> {                                     \
+    static char const *name() { return "__" #id "__"; }                        \
+    static auto execute(const L &l, const R &r) -> decltype(expr) {            \
+      return (expr);                                                           \
+    }                                                                          \
+    static B execute_cast(const L &l, const R &r) { return B(expr); }          \
+  };                                                                           \
+  template <typename B, typename L, typename R>                                \
+  struct op_impl<op_##id, op_r, B, L, R> {                                     \
+    static char const *name() { return "__" #rid "__"; }                       \
+    static auto execute(const R &r, const L &l) -> decltype(expr) {            \
+      return (expr);                                                           \
+    }                                                                          \
+    static B execute_cast(const R &r, const L &l) { return B(expr); }          \
+  };                                                                           \
+  inline op_<op_##id, op_l, self_t, self_t> op(const self_t &,                 \
+                                               const self_t &) {               \
+    return op_<op_##id, op_l, self_t, self_t>();                               \
+  }                                                                            \
+  template <typename T>                                                        \
+  op_<op_##id, op_l, self_t, T> op(const self_t &, const T &) {                \
+    return op_<op_##id, op_l, self_t, T>();                                    \
+  }                                                                            \
+  template <typename T>                                                        \
+  op_<op_##id, op_r, T, self_t> op(const T &, const self_t &) {                \
+    return op_<op_##id, op_r, T, self_t>();                                    \
+  }
 
-#define PYBIND11_INPLACE_OPERATOR(id, op, expr)                                        \
-template <typename B, typename L, typename R> struct op_impl<op_##id, op_l, B, L, R> { \
-    static char const* name() { return "__" #id "__"; }                                \
-    static auto execute(L &l, const R &r) -> decltype(expr) { return expr; }           \
-    static B execute_cast(L &l, const R &r) { return B(expr); }                        \
-};                                                                                     \
-template <typename T> op_<op_##id, op_l, self_t, T> op(const self_t &, const T &) {    \
-    return op_<op_##id, op_l, self_t, T>();                                            \
-}
+#define PYBIND11_INPLACE_OPERATOR(id, op, expr)                                \
+  template <typename B, typename L, typename R>                                \
+  struct op_impl<op_##id, op_l, B, L, R> {                                     \
+    static char const *name() { return "__" #id "__"; }                        \
+    static auto execute(L &l, const R &r) -> decltype(expr) { return expr; }   \
+    static B execute_cast(L &l, const R &r) { return B(expr); }                \
+  };                                                                           \
+  template <typename T>                                                        \
+  op_<op_##id, op_l, self_t, T> op(const self_t &, const T &) {                \
+    return op_<op_##id, op_l, self_t, T>();                                    \
+  }
 
-#define PYBIND11_UNARY_OPERATOR(id, op, expr)                                          \
-template <typename B, typename L> struct op_impl<op_##id, op_u, B, L, undefined_t> {   \
-    static char const* name() { return "__" #id "__"; }                                \
-    static auto execute(const L &l) -> decltype(expr) { return expr; }                 \
-    static B execute_cast(const L &l) { return B(expr); }                              \
-};                                                                                     \
-inline op_<op_##id, op_u, self_t, undefined_t> op(const self_t &) {                    \
-    return op_<op_##id, op_u, self_t, undefined_t>();                                  \
-}
+#define PYBIND11_UNARY_OPERATOR(id, op, expr)                                  \
+  template <typename B, typename L>                                            \
+  struct op_impl<op_##id, op_u, B, L, undefined_t> {                           \
+    static char const *name() { return "__" #id "__"; }                        \
+    static auto execute(const L &l) -> decltype(expr) { return expr; }         \
+    static B execute_cast(const L &l) { return B(expr); }                      \
+  };                                                                           \
+  inline op_<op_##id, op_u, self_t, undefined_t> op(const self_t &) {          \
+    return op_<op_##id, op_u, self_t, undefined_t>();                          \
+  }
 
-PYBIND11_BINARY_OPERATOR(sub,       rsub,         operator-,    l - r)
-PYBIND11_BINARY_OPERATOR(add,       radd,         operator+,    l + r)
-PYBIND11_BINARY_OPERATOR(mul,       rmul,         operator*,    l * r)
-PYBIND11_BINARY_OPERATOR(truediv,   rtruediv,     operator/,    l / r)
-PYBIND11_BINARY_OPERATOR(mod,       rmod,         operator%,    l % r)
-PYBIND11_BINARY_OPERATOR(lshift,    rlshift,      operator<<,   l << r)
-PYBIND11_BINARY_OPERATOR(rshift,    rrshift,      operator>>,   l >> r)
-PYBIND11_BINARY_OPERATOR(and,       rand,         operator&,    l & r)
-PYBIND11_BINARY_OPERATOR(xor,       rxor,         operator^,    l ^ r)
-PYBIND11_BINARY_OPERATOR(eq,        eq,           operator==,   l == r)
-PYBIND11_BINARY_OPERATOR(ne,        ne,           operator!=,   l != r)
-PYBIND11_BINARY_OPERATOR(or,        ror,          operator|,    l | r)
-PYBIND11_BINARY_OPERATOR(gt,        lt,           operator>,    l > r)
-PYBIND11_BINARY_OPERATOR(ge,        le,           operator>=,   l >= r)
-PYBIND11_BINARY_OPERATOR(lt,        gt,           operator<,    l < r)
-PYBIND11_BINARY_OPERATOR(le,        ge,           operator<=,   l <= r)
-//PYBIND11_BINARY_OPERATOR(pow,       rpow,         pow,          std::pow(l,  r))
-PYBIND11_INPLACE_OPERATOR(iadd,     operator+=,   l += r)
-PYBIND11_INPLACE_OPERATOR(isub,     operator-=,   l -= r)
-PYBIND11_INPLACE_OPERATOR(imul,     operator*=,   l *= r)
-PYBIND11_INPLACE_OPERATOR(itruediv, operator/=,   l /= r)
-PYBIND11_INPLACE_OPERATOR(imod,     operator%=,   l %= r)
-PYBIND11_INPLACE_OPERATOR(ilshift,  operator<<=,  l <<= r)
-PYBIND11_INPLACE_OPERATOR(irshift,  operator>>=,  l >>= r)
-PYBIND11_INPLACE_OPERATOR(iand,     operator&=,   l &= r)
-PYBIND11_INPLACE_OPERATOR(ixor,     operator^=,   l ^= r)
-PYBIND11_INPLACE_OPERATOR(ior,      operator|=,   l |= r)
-PYBIND11_UNARY_OPERATOR(neg,        operator-,    -l)
-PYBIND11_UNARY_OPERATOR(pos,        operator+,    +l)
-PYBIND11_UNARY_OPERATOR(abs,        abs,          std::abs(l))
-PYBIND11_UNARY_OPERATOR(hash,       hash,         std::hash<L>()(l))
-PYBIND11_UNARY_OPERATOR(invert,     operator~,    (~l))
-PYBIND11_UNARY_OPERATOR(bool,       operator!,    !!l)
-PYBIND11_UNARY_OPERATOR(int,        int_,         (int) l)
-PYBIND11_UNARY_OPERATOR(float,      float_,       (double) l)
+PYBIND11_BINARY_OPERATOR(sub, rsub, operator-, l - r)
+PYBIND11_BINARY_OPERATOR(add, radd, operator+, l + r)
+PYBIND11_BINARY_OPERATOR(mul, rmul, operator*, l * r)
+PYBIND11_BINARY_OPERATOR(truediv, rtruediv, operator/, l / r)
+PYBIND11_BINARY_OPERATOR(mod, rmod, operator%, l % r)
+PYBIND11_BINARY_OPERATOR(lshift, rlshift, operator<<, l << r)
+PYBIND11_BINARY_OPERATOR(rshift, rrshift, operator>>, l>> r)
+PYBIND11_BINARY_OPERATOR(and, rand, operator&, l & r)
+PYBIND11_BINARY_OPERATOR(xor, rxor, operator^, l ^ r)
+PYBIND11_BINARY_OPERATOR(eq, eq, operator==, l == r)
+PYBIND11_BINARY_OPERATOR(ne, ne, operator!=, l != r)
+PYBIND11_BINARY_OPERATOR(or, ror, operator|, l | r)
+PYBIND11_BINARY_OPERATOR(gt, lt, operator>, l> r)
+PYBIND11_BINARY_OPERATOR(ge, le, operator>=, l >= r)
+PYBIND11_BINARY_OPERATOR(lt, gt, operator<, l<r)
+PYBIND11_BINARY_OPERATOR(le, ge, operator<=, l <= r)
+// PYBIND11_BINARY_OPERATOR(pow,       rpow,         pow,          std::pow(l,
+// r))
+PYBIND11_INPLACE_OPERATOR(iadd, operator+=, l += r)
+PYBIND11_INPLACE_OPERATOR(isub, operator-=, l -= r)
+PYBIND11_INPLACE_OPERATOR(imul, operator*=, l *= r)
+PYBIND11_INPLACE_OPERATOR(itruediv, operator/=, l /= r)
+PYBIND11_INPLACE_OPERATOR(imod, operator%=, l %= r)
+PYBIND11_INPLACE_OPERATOR(ilshift, operator<<=, l <<= r)
+PYBIND11_INPLACE_OPERATOR(irshift, operator>>=, l >>= r)
+PYBIND11_INPLACE_OPERATOR(iand, operator&=, l &= r)
+PYBIND11_INPLACE_OPERATOR(ixor, operator^=, l ^= r)
+PYBIND11_INPLACE_OPERATOR(ior, operator|=, l |= r)
+PYBIND11_UNARY_OPERATOR(neg, operator-, - l)
+PYBIND11_UNARY_OPERATOR(pos, operator+, + l)
+PYBIND11_UNARY_OPERATOR(abs, abs, std::abs(l))
+PYBIND11_UNARY_OPERATOR(hash, hash, std::hash<L>()(l))
+PYBIND11_UNARY_OPERATOR(invert, operator~,(~l))
+PYBIND11_UNARY_OPERATOR(bool, operator!, !!l)
+PYBIND11_UNARY_OPERATOR(int, int_, (int)l)
+PYBIND11_UNARY_OPERATOR(float, float_, (double)l)
 
 #undef PYBIND11_BINARY_OPERATOR
 #undef PYBIND11_INPLACE_OPERATOR
@@ -164,5 +222,5 @@ using detail::self;
 NAMESPACE_END(PYBIND11_NAMESPACE)
 
 #if defined(_MSC_VER)
-#  pragma warning(pop)
+#pragma warning(pop)
 #endif
diff --git a/python/src/pybind11/options.h b/python/src/pybind11/options.h
index cc1e1f6f0..17bb23ab2 100644
--- a/python/src/pybind11/options.h
+++ b/python/src/pybind11/options.h
@@ -15,51 +15,65 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
 class options {
 public:
+  // Default RAII constructor, which leaves settings as they currently are.
+  options() : previous_state(global_state()) {}
 
-    // Default RAII constructor, which leaves settings as they currently are.
-    options() : previous_state(global_state()) {}
+  // Class is non-copyable.
+  options(const options &) = delete;
+  options &operator=(const options &) = delete;
 
-    // Class is non-copyable.
-    options(const options&) = delete;
-    options& operator=(const options&) = delete;
+  // Destructor, which restores settings that were in effect before.
+  ~options() { global_state() = previous_state; }
 
-    // Destructor, which restores settings that were in effect before.
-    ~options() {
-        global_state() = previous_state;
-    }
+  // Setter methods (affect the global state):
 
-    // Setter methods (affect the global state):
+  options &disable_user_defined_docstrings() & {
+    global_state().show_user_defined_docstrings = false;
+    return *this;
+  }
 
-    options& disable_user_defined_docstrings() & { global_state().show_user_defined_docstrings = false; return *this; }
+  options &enable_user_defined_docstrings() & {
+    global_state().show_user_defined_docstrings = true;
+    return *this;
+  }
 
-    options& enable_user_defined_docstrings() & { global_state().show_user_defined_docstrings = true; return *this; }
+  options &disable_function_signatures() & {
+    global_state().show_function_signatures = false;
+    return *this;
+  }
 
-    options& disable_function_signatures() & { global_state().show_function_signatures = false; return *this; }
+  options &enable_function_signatures() & {
+    global_state().show_function_signatures = true;
+    return *this;
+  }
 
-    options& enable_function_signatures() & { global_state().show_function_signatures = true; return *this; }
+  // Getter methods (return the global state):
 
-    // Getter methods (return the global state):
+  static bool show_user_defined_docstrings() {
+    return global_state().show_user_defined_docstrings;
+  }
 
-    static bool show_user_defined_docstrings() { return global_state().show_user_defined_docstrings; }
+  static bool show_function_signatures() {
+    return global_state().show_function_signatures;
+  }
 
-    static bool show_function_signatures() { return global_state().show_function_signatures; }
-
-    // This type is not meant to be allocated on the heap.
-    void* operator new(size_t) = delete;
+  // This type is not meant to be allocated on the heap.
+  void *operator new(size_t) = delete;
 
 private:
+  struct state {
+    bool show_user_defined_docstrings =
+        true; //< Include user-supplied texts in docstrings.
+    bool show_function_signatures =
+        true; //< Include auto-generated function signatures in docstrings.
+  };
 
-    struct state {
-        bool show_user_defined_docstrings = true;  //< Include user-supplied texts in docstrings.
-        bool show_function_signatures = true;      //< Include auto-generated function signatures in docstrings.
-    };
+  static state &global_state() {
+    static state instance;
+    return instance;
+  }
 
-    static state &global_state() {
-        static state instance;
-        return instance;
-    }
-
-    state previous_state;
+  state previous_state;
 };
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/pybind11.h b/python/src/pybind11/pybind11.h
index f1d91c788..c1c8c8870 100644
--- a/python/src/pybind11/pybind11.h
+++ b/python/src/pybind11/pybind11.h
@@ -11,1006 +11,1143 @@
 #pragma once
 
 #if defined(__INTEL_COMPILER)
-#  pragma warning push
-#  pragma warning disable 68    // integer conversion resulted in a change of sign
-#  pragma warning disable 186   // pointless comparison of unsigned integer with zero
-#  pragma warning disable 878   // incompatible exception specifications
-#  pragma warning disable 1334  // the "template" keyword used for syntactic disambiguation may only be used within a template
-#  pragma warning disable 1682  // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem)
-#  pragma warning disable 1786  // function "strdup" was declared deprecated
-#  pragma warning disable 1875  // offsetof applied to non-POD (Plain Old Data) types is nonstandard
-#  pragma warning disable 2196  // warning #2196: routine is both "inline" and "noinline"
+#pragma warning push
+#pragma warning disable 68 // integer conversion resulted in a change of sign
+#pragma warning                                                                \
+    disable 186 // pointless comparison of unsigned integer with zero
+#pragma warning disable 878 // incompatible exception specifications
+#pragma warning                                                                \
+    disable 1334 // the "template" keyword used for syntactic disambiguation may
+                 // only be used within a template
+#pragma warning                                                                \
+    disable 1682 // implicit conversion of a 64-bit integral type to a smaller
+                 // integral type (potential portability problem)
+#pragma warning disable 1786 // function "strdup" was declared deprecated
+#pragma warning disable 1875 // offsetof applied to non-POD (Plain Old Data)
+                             // types is nonstandard
+#pragma warning                                                                \
+    disable 2196 // warning #2196: routine is both "inline" and "noinline"
 #elif defined(_MSC_VER)
-#  pragma warning(push)
-#  pragma warning(disable: 4100) // warning C4100: Unreferenced formal parameter
-#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
-#  pragma warning(disable: 4512) // warning C4512: Assignment operator was implicitly defined as deleted
-#  pragma warning(disable: 4800) // warning C4800: 'int': forcing value to bool 'true' or 'false' (performance warning)
-#  pragma warning(disable: 4996) // warning C4996: The POSIX name for this item is deprecated. Instead, use the ISO C and C++ conformant name
-#  pragma warning(disable: 4702) // warning C4702: unreachable code
-#  pragma warning(disable: 4522) // warning C4522: multiple assignment operators specified
+#pragma warning(push)
+#pragma warning(disable : 4100) // warning C4100: Unreferenced formal parameter
+#pragma warning(                                                               \
+    disable : 4127) // warning C4127: Conditional expression is constant
+#pragma warning(disable : 4512) // warning C4512: Assignment operator was
+                                // implicitly defined as deleted
+#pragma warning(disable : 4800) // warning C4800: 'int': forcing value to bool
+                                // 'true' or 'false' (performance warning)
+#pragma warning(disable : 4996) // warning C4996: The POSIX name for this item
+                                // is deprecated. Instead, use the ISO C and C++
+                                // conformant name
+#pragma warning(disable : 4702) // warning C4702: unreachable code
+#pragma warning(                                                               \
+    disable : 4522) // warning C4522: multiple assignment operators specified
 #elif defined(__GNUG__) && !defined(__clang__)
-#  pragma GCC diagnostic push
-#  pragma GCC diagnostic ignored "-Wunused-but-set-parameter"
-#  pragma GCC diagnostic ignored "-Wunused-but-set-variable"
-#  pragma GCC diagnostic ignored "-Wmissing-field-initializers"
-#  pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#  pragma GCC diagnostic ignored "-Wattributes"
-#  if __GNUC__ >= 7
-#    pragma GCC diagnostic ignored "-Wnoexcept-type"
-#  endif
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-but-set-parameter"
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wattributes"
+#if __GNUC__ >= 7
+#pragma GCC diagnostic ignored "-Wnoexcept-type"
+#endif
 #endif
 
 #if defined(__GNUG__) && !defined(__clang__)
- #include <cxxabi.h>
+#include <cxxabi.h>
 #endif
 
-
 #include "attr.h"
-#include "options.h"
 #include "detail/class.h"
 #include "detail/init.h"
+#include "options.h"
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
-/// Wraps an arbitrary C++ function/method/lambda function/.. into a callable Python object
+/// Wraps an arbitrary C++ function/method/lambda function/.. into a callable
+/// Python object
 class cpp_function : public function {
 public:
-    cpp_function() { }
-    cpp_function(std::nullptr_t) { }
+  cpp_function() {}
+  cpp_function(std::nullptr_t) {}
 
-    /// Construct a cpp_function from a vanilla function pointer
-    template <typename Return, typename... Args, typename... Extra>
-    cpp_function(Return (*f)(Args...), const Extra&... extra) {
-        initialize(f, f, extra...);
-    }
+  /// Construct a cpp_function from a vanilla function pointer
+  template <typename Return, typename... Args, typename... Extra>
+  cpp_function(Return (*f)(Args...), const Extra &... extra) {
+    initialize(f, f, extra...);
+  }
 
-    /// Construct a cpp_function from a lambda function (possibly with internal state)
-    template <typename Func, typename... Extra,
-              typename = detail::enable_if_t<detail::is_lambda<Func>::value>>
-    cpp_function(Func &&f, const Extra&... extra) {
-        initialize(std::forward<Func>(f),
-                   (detail::function_signature_t<Func> *) nullptr, extra...);
-    }
+  /// Construct a cpp_function from a lambda function (possibly with internal
+  /// state)
+  template <typename Func, typename... Extra,
+            typename = detail::enable_if_t<detail::is_lambda<Func>::value>>
+  cpp_function(Func &&f, const Extra &... extra) {
+    initialize(std::forward<Func>(f),
+               (detail::function_signature_t<Func> *)nullptr, extra...);
+  }
 
-    /// Construct a cpp_function from a class method (non-const)
-    template <typename Return, typename Class, typename... Arg, typename... Extra>
-    cpp_function(Return (Class::*f)(Arg...), const Extra&... extra) {
-        initialize([f](Class *c, Arg... args) -> Return { return (c->*f)(args...); },
-                   (Return (*) (Class *, Arg...)) nullptr, extra...);
-    }
+  /// Construct a cpp_function from a class method (non-const)
+  template <typename Return, typename Class, typename... Arg, typename... Extra>
+  cpp_function(Return (Class::*f)(Arg...), const Extra &... extra) {
+    initialize(
+        [f](Class *c, Arg... args) -> Return { return (c->*f)(args...); },
+        (Return(*)(Class *, Arg...)) nullptr, extra...);
+  }
 
-    /// Construct a cpp_function from a class method (const)
-    template <typename Return, typename Class, typename... Arg, typename... Extra>
-    cpp_function(Return (Class::*f)(Arg...) const, const Extra&... extra) {
-        initialize([f](const Class *c, Arg... args) -> Return { return (c->*f)(args...); },
-                   (Return (*)(const Class *, Arg ...)) nullptr, extra...);
-    }
+  /// Construct a cpp_function from a class method (const)
+  template <typename Return, typename Class, typename... Arg, typename... Extra>
+  cpp_function(Return (Class::*f)(Arg...) const, const Extra &... extra) {
+    initialize(
+        [f](const Class *c, Arg... args) -> Return { return (c->*f)(args...); },
+        (Return(*)(const Class *, Arg...)) nullptr, extra...);
+  }
 
-    /// Return the function name
-    object name() const { return attr("__name__"); }
+  /// Return the function name
+  object name() const { return attr("__name__"); }
 
 protected:
-    /// Space optimization: don't inline this frequently instantiated fragment
-    PYBIND11_NOINLINE detail::function_record *make_function_record() {
-        return new detail::function_record();
-    }
+  /// Space optimization: don't inline this frequently instantiated fragment
+  PYBIND11_NOINLINE detail::function_record *make_function_record() {
+    return new detail::function_record();
+  }
 
-    /// Special internal constructor for functors, lambda functions, etc.
-    template <typename Func, typename Return, typename... Args, typename... Extra>
-    void initialize(Func &&f, Return (*)(Args...), const Extra&... extra) {
-        using namespace detail;
-        struct capture { remove_reference_t<Func> f; };
+  /// Special internal constructor for functors, lambda functions, etc.
+  template <typename Func, typename Return, typename... Args, typename... Extra>
+  void initialize(Func &&f, Return (*)(Args...), const Extra &... extra) {
+    using namespace detail;
+    struct capture {
+      remove_reference_t<Func> f;
+    };
 
-        /* Store the function including any extra state it might have (e.g. a lambda capture object) */
-        auto rec = make_function_record();
+    /* Store the function including any extra state it might have (e.g. a lambda
+     * capture object) */
+    auto rec = make_function_record();
 
-        /* Store the capture object directly in the function record if there is enough space */
-        if (sizeof(capture) <= sizeof(rec->data)) {
-            /* Without these pragmas, GCC warns that there might not be
-               enough space to use the placement new operator. However, the
-               'if' statement above ensures that this is the case. */
+    /* Store the capture object directly in the function record if there is
+     * enough space */
+    if (sizeof(capture) <= sizeof(rec->data)) {
+      /* Without these pragmas, GCC warns that there might not be
+         enough space to use the placement new operator. However, the
+         'if' statement above ensures that this is the case. */
 #if defined(__GNUG__) && !defined(__clang__) && __GNUC__ >= 6
-#  pragma GCC diagnostic push
-#  pragma GCC diagnostic ignored "-Wplacement-new"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wplacement-new"
 #endif
-            new ((capture *) &rec->data) capture { std::forward<Func>(f) };
+      new ((capture *)&rec->data) capture{std::forward<Func>(f)};
 #if defined(__GNUG__) && !defined(__clang__) && __GNUC__ >= 6
-#  pragma GCC diagnostic pop
+#pragma GCC diagnostic pop
 #endif
-            if (!std::is_trivially_destructible<Func>::value)
-                rec->free_data = [](function_record *r) { ((capture *) &r->data)->~capture(); };
-        } else {
-            rec->data[0] = new capture { std::forward<Func>(f) };
-            rec->free_data = [](function_record *r) { delete ((capture *) r->data[0]); };
-        }
-
-        /* Type casters for the function arguments and return value */
-        using cast_in = argument_loader<Args...>;
-        using cast_out = make_caster<
-            conditional_t<std::is_void<Return>::value, void_type, Return>
-        >;
-
-        static_assert(expected_num_args<Extra...>(sizeof...(Args), cast_in::has_args, cast_in::has_kwargs),
-                      "The number of argument annotations does not match the number of function arguments");
-
-        /* Dispatch code which converts function arguments and performs the actual function call */
-        rec->impl = [](function_call &call) -> handle {
-            cast_in args_converter;
-
-            /* Try to cast the function arguments into the C++ domain */
-            if (!args_converter.load_args(call))
-                return PYBIND11_TRY_NEXT_OVERLOAD;
-
-            /* Invoke call policy pre-call hook */
-            process_attributes<Extra...>::precall(call);
-
-            /* Get a pointer to the capture object */
-            auto data = (sizeof(capture) <= sizeof(call.func.data)
-                         ? &call.func.data : call.func.data[0]);
-            capture *cap = const_cast<capture *>(reinterpret_cast<const capture *>(data));
-
-            /* Override policy for rvalues -- usually to enforce rvp::move on an rvalue */
-            return_value_policy policy = return_value_policy_override<Return>::policy(call.func.policy);
-
-            /* Function scope guard -- defaults to the compile-to-nothing `void_type` */
-            using Guard = extract_guard_t<Extra...>;
-
-            /* Perform the function call */
-            handle result = cast_out::cast(
-                std::move(args_converter).template call<Return, Guard>(cap->f), policy, call.parent);
-
-            /* Invoke call policy post-call hook */
-            process_attributes<Extra...>::postcall(call, result);
-
-            return result;
+      if (!std::is_trivially_destructible<Func>::value)
+        rec->free_data = [](function_record *r) {
+          ((capture *)&r->data)->~capture();
         };
-
-        /* Process any user-provided function attributes */
-        process_attributes<Extra...>::init(extra..., rec);
-
-        /* Generate a readable signature describing the function's arguments and return value types */
-        static constexpr auto signature = _("(") + cast_in::arg_names + _(") -> ") + cast_out::name;
-        PYBIND11_DESCR_CONSTEXPR auto types = decltype(signature)::types();
-
-        /* Register the function with Python from generic (non-templated) code */
-        initialize_generic(rec, signature.text, types.data(), sizeof...(Args));
-
-        if (cast_in::has_args) rec->has_args = true;
-        if (cast_in::has_kwargs) rec->has_kwargs = true;
-
-        /* Stash some additional information used by an important optimization in 'functional.h' */
-        using FunctionType = Return (*)(Args...);
-        constexpr bool is_function_ptr =
-            std::is_convertible<Func, FunctionType>::value &&
-            sizeof(capture) == sizeof(void *);
-        if (is_function_ptr) {
-            rec->is_stateless = true;
-            rec->data[1] = const_cast<void *>(reinterpret_cast<const void *>(&typeid(FunctionType)));
-        }
+    } else {
+      rec->data[0] = new capture{std::forward<Func>(f)};
+      rec->free_data = [](function_record *r) {
+        delete ((capture *)r->data[0]);
+      };
     }
 
-    /// Register a function call with Python (generic non-templated code goes here)
-    void initialize_generic(detail::function_record *rec, const char *text,
-                            const std::type_info *const *types, size_t args) {
+    /* Type casters for the function arguments and return value */
+    using cast_in = argument_loader<Args...>;
+    using cast_out = make_caster<
+        conditional_t<std::is_void<Return>::value, void_type, Return>>;
 
-        /* Create copies of all referenced C-style strings */
-        rec->name = strdup(rec->name ? rec->name : "");
-        if (rec->doc) rec->doc = strdup(rec->doc);
-        for (auto &a: rec->args) {
-            if (a.name)
-                a.name = strdup(a.name);
-            if (a.descr)
-                a.descr = strdup(a.descr);
-            else if (a.value)
-                a.descr = strdup(a.value.attr("__repr__")().cast<std::string>().c_str());
-        }
+    static_assert(expected_num_args<Extra...>(
+                      sizeof...(Args), cast_in::has_args, cast_in::has_kwargs),
+                  "The number of argument annotations does not match the "
+                  "number of function arguments");
 
-        rec->is_constructor = !strcmp(rec->name, "__init__") || !strcmp(rec->name, "__setstate__");
+    /* Dispatch code which converts function arguments and performs the actual
+     * function call */
+    rec->impl = [](function_call &call) -> handle {
+      cast_in args_converter;
+
+      /* Try to cast the function arguments into the C++ domain */
+      if (!args_converter.load_args(call))
+        return PYBIND11_TRY_NEXT_OVERLOAD;
+
+      /* Invoke call policy pre-call hook */
+      process_attributes<Extra...>::precall(call);
+
+      /* Get a pointer to the capture object */
+      auto data =
+          (sizeof(capture) <= sizeof(call.func.data) ? &call.func.data
+                                                     : call.func.data[0]);
+      capture *cap =
+          const_cast<capture *>(reinterpret_cast<const capture *>(data));
+
+      /* Override policy for rvalues -- usually to enforce rvp::move on an
+       * rvalue */
+      return_value_policy policy =
+          return_value_policy_override<Return>::policy(call.func.policy);
+
+      /* Function scope guard -- defaults to the compile-to-nothing `void_type`
+       */
+      using Guard = extract_guard_t<Extra...>;
+
+      /* Perform the function call */
+      handle result = cast_out::cast(
+          std::move(args_converter).template call<Return, Guard>(cap->f),
+          policy, call.parent);
+
+      /* Invoke call policy post-call hook */
+      process_attributes<Extra...>::postcall(call, result);
+
+      return result;
+    };
+
+    /* Process any user-provided function attributes */
+    process_attributes<Extra...>::init(extra..., rec);
+
+    /* Generate a readable signature describing the function's arguments and
+     * return value types */
+    static constexpr auto signature =
+        _("(") + cast_in::arg_names + _(") -> ") + cast_out::name;
+    PYBIND11_DESCR_CONSTEXPR auto types = decltype(signature)::types();
+
+    /* Register the function with Python from generic (non-templated) code */
+    initialize_generic(rec, signature.text, types.data(), sizeof...(Args));
+
+    if (cast_in::has_args)
+      rec->has_args = true;
+    if (cast_in::has_kwargs)
+      rec->has_kwargs = true;
+
+    /* Stash some additional information used by an important optimization in
+     * 'functional.h' */
+    using FunctionType = Return (*)(Args...);
+    constexpr bool is_function_ptr =
+        std::is_convertible<Func, FunctionType>::value &&
+        sizeof(capture) == sizeof(void *);
+    if (is_function_ptr) {
+      rec->is_stateless = true;
+      rec->data[1] = const_cast<void *>(
+          reinterpret_cast<const void *>(&typeid(FunctionType)));
+    }
+  }
+
+  /// Register a function call with Python (generic non-templated code goes
+  /// here)
+  void initialize_generic(detail::function_record *rec, const char *text,
+                          const std::type_info *const *types, size_t args) {
+
+    /* Create copies of all referenced C-style strings */
+    rec->name = strdup(rec->name ? rec->name : "");
+    if (rec->doc)
+      rec->doc = strdup(rec->doc);
+    for (auto &a : rec->args) {
+      if (a.name)
+        a.name = strdup(a.name);
+      if (a.descr)
+        a.descr = strdup(a.descr);
+      else if (a.value)
+        a.descr =
+            strdup(a.value.attr("__repr__")().cast<std::string>().c_str());
+    }
+
+    rec->is_constructor =
+        !strcmp(rec->name, "__init__") || !strcmp(rec->name, "__setstate__");
 
 #if !defined(NDEBUG) && !defined(PYBIND11_DISABLE_NEW_STYLE_INIT_WARNING)
-        if (rec->is_constructor && !rec->is_new_style_constructor) {
-            const auto class_name = std::string(((PyTypeObject *) rec->scope.ptr())->tp_name);
-            const auto func_name = std::string(rec->name);
-            PyErr_WarnEx(
-                PyExc_FutureWarning,
-                ("pybind11-bound class '" + class_name + "' is using an old-style "
-                 "placement-new '" + func_name + "' which has been deprecated. See "
-                 "the upgrade guide in pybind11's docs. This message is only visible "
-                 "when compiled in debug mode.").c_str(), 0
-            );
-        }
+    if (rec->is_constructor && !rec->is_new_style_constructor) {
+      const auto class_name =
+          std::string(((PyTypeObject *)rec->scope.ptr())->tp_name);
+      const auto func_name = std::string(rec->name);
+      PyErr_WarnEx(
+          PyExc_FutureWarning,
+          ("pybind11-bound class '" + class_name +
+           "' is using an old-style "
+           "placement-new '" +
+           func_name +
+           "' which has been deprecated. See "
+           "the upgrade guide in pybind11's docs. This message is only visible "
+           "when compiled in debug mode.")
+              .c_str(),
+          0);
+    }
 #endif
 
-        /* Generate a proper function signature */
-        std::string signature;
-        size_t type_index = 0, arg_index = 0;
-        for (auto *pc = text; *pc != '\0'; ++pc) {
-            const auto c = *pc;
+    /* Generate a proper function signature */
+    std::string signature;
+    size_t type_index = 0, arg_index = 0;
+    for (auto *pc = text; *pc != '\0'; ++pc) {
+      const auto c = *pc;
 
-            if (c == '{') {
-                // Write arg name for everything except *args and **kwargs.
-                if (*(pc + 1) == '*')
-                    continue;
+      if (c == '{') {
+        // Write arg name for everything except *args and **kwargs.
+        if (*(pc + 1) == '*')
+          continue;
 
-                if (arg_index < rec->args.size() && rec->args[arg_index].name) {
-                    signature += rec->args[arg_index].name;
-                } else if (arg_index == 0 && rec->is_method) {
-                    signature += "self";
-                } else {
-                    signature += "arg" + std::to_string(arg_index - (rec->is_method ? 1 : 0));
-                }
-                signature += ": ";
-            } else if (c == '}') {
-                // Write default value if available.
-                if (arg_index < rec->args.size() && rec->args[arg_index].descr) {
-                    signature += " = ";
-                    signature += rec->args[arg_index].descr;
-                }
-                arg_index++;
-            } else if (c == '%') {
-                const std::type_info *t = types[type_index++];
-                if (!t)
-                    pybind11_fail("Internal error while parsing type signature (1)");
-                if (auto tinfo = detail::get_type_info(*t)) {
-                    handle th((PyObject *) tinfo->type);
-                    signature +=
-                        th.attr("__module__").cast<std::string>() + "." +
-                        th.attr("__qualname__").cast<std::string>(); // Python 3.3+, but we backport it to earlier versions
-                } else if (rec->is_new_style_constructor && arg_index == 0) {
-                    // A new-style `__init__` takes `self` as `value_and_holder`.
-                    // Rewrite it to the proper class type.
-                    signature +=
-                        rec->scope.attr("__module__").cast<std::string>() + "." +
-                        rec->scope.attr("__qualname__").cast<std::string>();
-                } else {
-                    std::string tname(t->name());
-                    detail::clean_type_id(tname);
-                    signature += tname;
-                }
-            } else {
-                signature += c;
-            }
+        if (arg_index < rec->args.size() && rec->args[arg_index].name) {
+          signature += rec->args[arg_index].name;
+        } else if (arg_index == 0 && rec->is_method) {
+          signature += "self";
+        } else {
+          signature +=
+              "arg" + std::to_string(arg_index - (rec->is_method ? 1 : 0));
         }
-        if (arg_index != args || types[type_index] != nullptr)
-            pybind11_fail("Internal error while parsing type signature (2)");
+        signature += ": ";
+      } else if (c == '}') {
+        // Write default value if available.
+        if (arg_index < rec->args.size() && rec->args[arg_index].descr) {
+          signature += " = ";
+          signature += rec->args[arg_index].descr;
+        }
+        arg_index++;
+      } else if (c == '%') {
+        const std::type_info *t = types[type_index++];
+        if (!t)
+          pybind11_fail("Internal error while parsing type signature (1)");
+        if (auto tinfo = detail::get_type_info(*t)) {
+          handle th((PyObject *)tinfo->type);
+          signature += th.attr("__module__").cast<std::string>() + "." +
+                       th.attr("__qualname__")
+                           .cast<std::string>(); // Python 3.3+, but we backport
+                                                 // it to earlier versions
+        } else if (rec->is_new_style_constructor && arg_index == 0) {
+          // A new-style `__init__` takes `self` as `value_and_holder`.
+          // Rewrite it to the proper class type.
+          signature += rec->scope.attr("__module__").cast<std::string>() + "." +
+                       rec->scope.attr("__qualname__").cast<std::string>();
+        } else {
+          std::string tname(t->name());
+          detail::clean_type_id(tname);
+          signature += tname;
+        }
+      } else {
+        signature += c;
+      }
+    }
+    if (arg_index != args || types[type_index] != nullptr)
+      pybind11_fail("Internal error while parsing type signature (2)");
 
 #if PY_MAJOR_VERSION < 3
-        if (strcmp(rec->name, "__next__") == 0) {
-            std::free(rec->name);
-            rec->name = strdup("next");
-        } else if (strcmp(rec->name, "__bool__") == 0) {
-            std::free(rec->name);
-            rec->name = strdup("__nonzero__");
-        }
+    if (strcmp(rec->name, "__next__") == 0) {
+      std::free(rec->name);
+      rec->name = strdup("next");
+    } else if (strcmp(rec->name, "__bool__") == 0) {
+      std::free(rec->name);
+      rec->name = strdup("__nonzero__");
+    }
 #endif
-        rec->signature = strdup(signature.c_str());
-        rec->args.shrink_to_fit();
-        rec->nargs = (std::uint16_t) args;
+    rec->signature = strdup(signature.c_str());
+    rec->args.shrink_to_fit();
+    rec->nargs = (std::uint16_t)args;
 
-        if (rec->sibling && PYBIND11_INSTANCE_METHOD_CHECK(rec->sibling.ptr()))
-            rec->sibling = PYBIND11_INSTANCE_METHOD_GET_FUNCTION(rec->sibling.ptr());
+    if (rec->sibling && PYBIND11_INSTANCE_METHOD_CHECK(rec->sibling.ptr()))
+      rec->sibling = PYBIND11_INSTANCE_METHOD_GET_FUNCTION(rec->sibling.ptr());
 
-        detail::function_record *chain = nullptr, *chain_start = rec;
-        if (rec->sibling) {
-            if (PyCFunction_Check(rec->sibling.ptr())) {
-                auto rec_capsule = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(rec->sibling.ptr()));
-                chain = (detail::function_record *) rec_capsule;
-                /* Never append a method to an overload chain of a parent class;
-                   instead, hide the parent's overloads in this case */
-                if (!chain->scope.is(rec->scope))
-                    chain = nullptr;
-            }
-            // Don't trigger for things like the default __init__, which are wrapper_descriptors that we are intentionally replacing
-            else if (!rec->sibling.is_none() && rec->name[0] != '_')
-                pybind11_fail("Cannot overload existing non-function object \"" + std::string(rec->name) +
-                        "\" with a function of the same name");
-        }
-
-        if (!chain) {
-            /* No existing overload was found, create a new function object */
-            rec->def = new PyMethodDef();
-            std::memset(rec->def, 0, sizeof(PyMethodDef));
-            rec->def->ml_name = rec->name;
-            rec->def->ml_meth = reinterpret_cast<PyCFunction>(reinterpret_cast<void (*) (void)>(*dispatcher));
-            rec->def->ml_flags = METH_VARARGS | METH_KEYWORDS;
-
-            capsule rec_capsule(rec, [](void *ptr) {
-                destruct((detail::function_record *) ptr);
-            });
-
-            object scope_module;
-            if (rec->scope) {
-                if (hasattr(rec->scope, "__module__")) {
-                    scope_module = rec->scope.attr("__module__");
-                } else if (hasattr(rec->scope, "__name__")) {
-                    scope_module = rec->scope.attr("__name__");
-                }
-            }
-
-            m_ptr = PyCFunction_NewEx(rec->def, rec_capsule.ptr(), scope_module.ptr());
-            if (!m_ptr)
-                pybind11_fail("cpp_function::cpp_function(): Could not allocate function object");
-        } else {
-            /* Append at the end of the overload chain */
-            m_ptr = rec->sibling.ptr();
-            inc_ref();
-            chain_start = chain;
-            if (chain->is_method != rec->is_method)
-                pybind11_fail("overloading a method with both static and instance methods is not supported; "
-                    #if defined(NDEBUG)
-                        "compile in debug mode for more details"
-                    #else
-                        "error while attempting to bind " + std::string(rec->is_method ? "instance" : "static") + " method " +
-                        std::string(pybind11::str(rec->scope.attr("__name__"))) + "." + std::string(rec->name) + signature
-                    #endif
-                );
-            while (chain->next)
-                chain = chain->next;
-            chain->next = rec;
-        }
-
-        std::string signatures;
-        int index = 0;
-        /* Create a nice pydoc rec including all signatures and
-           docstrings of the functions in the overload chain */
-        if (chain && options::show_function_signatures()) {
-            // First a generic signature
-            signatures += rec->name;
-            signatures += "(*args, **kwargs)\n";
-            signatures += "Overloaded function.\n\n";
-        }
-        // Then specific overload signatures
-        bool first_user_def = true;
-        for (auto it = chain_start; it != nullptr; it = it->next) {
-            if (options::show_function_signatures()) {
-                if (index > 0) signatures += "\n";
-                if (chain)
-                    signatures += std::to_string(++index) + ". ";
-                signatures += rec->name;
-                signatures += it->signature;
-                signatures += "\n";
-            }
-            if (it->doc && strlen(it->doc) > 0 && options::show_user_defined_docstrings()) {
-                // If we're appending another docstring, and aren't printing function signatures, we
-                // need to append a newline first:
-                if (!options::show_function_signatures()) {
-                    if (first_user_def) first_user_def = false;
-                    else signatures += "\n";
-                }
-                if (options::show_function_signatures()) signatures += "\n";
-                signatures += it->doc;
-                if (options::show_function_signatures()) signatures += "\n";
-            }
-        }
-
-        /* Install docstring */
-        PyCFunctionObject *func = (PyCFunctionObject *) m_ptr;
-        if (func->m_ml->ml_doc)
-            std::free(const_cast<char *>(func->m_ml->ml_doc));
-        func->m_ml->ml_doc = strdup(signatures.c_str());
-
-        if (rec->is_method) {
-            m_ptr = PYBIND11_INSTANCE_METHOD_NEW(m_ptr, rec->scope.ptr());
-            if (!m_ptr)
-                pybind11_fail("cpp_function::cpp_function(): Could not allocate instance method object");
-            Py_DECREF(func);
-        }
+    detail::function_record *chain = nullptr, *chain_start = rec;
+    if (rec->sibling) {
+      if (PyCFunction_Check(rec->sibling.ptr())) {
+        auto rec_capsule = reinterpret_borrow<capsule>(
+            PyCFunction_GET_SELF(rec->sibling.ptr()));
+        chain = (detail::function_record *)rec_capsule;
+        /* Never append a method to an overload chain of a parent class;
+           instead, hide the parent's overloads in this case */
+        if (!chain->scope.is(rec->scope))
+          chain = nullptr;
+      }
+      // Don't trigger for things like the default __init__, which are
+      // wrapper_descriptors that we are intentionally replacing
+      else if (!rec->sibling.is_none() && rec->name[0] != '_')
+        pybind11_fail("Cannot overload existing non-function object \"" +
+                      std::string(rec->name) +
+                      "\" with a function of the same name");
     }
 
-    /// When a cpp_function is GCed, release any memory allocated by pybind11
-    static void destruct(detail::function_record *rec) {
-        while (rec) {
-            detail::function_record *next = rec->next;
-            if (rec->free_data)
-                rec->free_data(rec);
-            std::free((char *) rec->name);
-            std::free((char *) rec->doc);
-            std::free((char *) rec->signature);
-            for (auto &arg: rec->args) {
-                std::free(const_cast<char *>(arg.name));
-                std::free(const_cast<char *>(arg.descr));
-                arg.value.dec_ref();
-            }
-            if (rec->def) {
-                std::free(const_cast<char *>(rec->def->ml_doc));
-                delete rec->def;
-            }
-            delete rec;
-            rec = next;
+    if (!chain) {
+      /* No existing overload was found, create a new function object */
+      rec->def = new PyMethodDef();
+      std::memset(rec->def, 0, sizeof(PyMethodDef));
+      rec->def->ml_name = rec->name;
+      rec->def->ml_meth = reinterpret_cast<PyCFunction>(
+          reinterpret_cast<void (*)(void)>(*dispatcher));
+      rec->def->ml_flags = METH_VARARGS | METH_KEYWORDS;
+
+      capsule rec_capsule(
+          rec, [](void *ptr) { destruct((detail::function_record *)ptr); });
+
+      object scope_module;
+      if (rec->scope) {
+        if (hasattr(rec->scope, "__module__")) {
+          scope_module = rec->scope.attr("__module__");
+        } else if (hasattr(rec->scope, "__name__")) {
+          scope_module = rec->scope.attr("__name__");
         }
+      }
+
+      m_ptr =
+          PyCFunction_NewEx(rec->def, rec_capsule.ptr(), scope_module.ptr());
+      if (!m_ptr)
+        pybind11_fail(
+            "cpp_function::cpp_function(): Could not allocate function object");
+    } else {
+      /* Append at the end of the overload chain */
+      m_ptr = rec->sibling.ptr();
+      inc_ref();
+      chain_start = chain;
+      if (chain->is_method != rec->is_method)
+        pybind11_fail("overloading a method with both static and instance "
+                      "methods is not supported; "
+#if defined(NDEBUG)
+                      "compile in debug mode for more details"
+#else
+                      "error while attempting to bind " +
+                      std::string(rec->is_method ? "instance" : "static") +
+                      " method " +
+                      std::string(pybind11::str(rec->scope.attr("__name__"))) +
+                      "." + std::string(rec->name) + signature
+#endif
+        );
+      while (chain->next)
+        chain = chain->next;
+      chain->next = rec;
     }
 
-    /// Main dispatch logic for calls to functions bound using pybind11
-    static PyObject *dispatcher(PyObject *self, PyObject *args_in, PyObject *kwargs_in) {
-        using namespace detail;
+    std::string signatures;
+    int index = 0;
+    /* Create a nice pydoc rec including all signatures and
+       docstrings of the functions in the overload chain */
+    if (chain && options::show_function_signatures()) {
+      // First a generic signature
+      signatures += rec->name;
+      signatures += "(*args, **kwargs)\n";
+      signatures += "Overloaded function.\n\n";
+    }
+    // Then specific overload signatures
+    bool first_user_def = true;
+    for (auto it = chain_start; it != nullptr; it = it->next) {
+      if (options::show_function_signatures()) {
+        if (index > 0)
+          signatures += "\n";
+        if (chain)
+          signatures += std::to_string(++index) + ". ";
+        signatures += rec->name;
+        signatures += it->signature;
+        signatures += "\n";
+      }
+      if (it->doc && strlen(it->doc) > 0 &&
+          options::show_user_defined_docstrings()) {
+        // If we're appending another docstring, and aren't printing function
+        // signatures, we need to append a newline first:
+        if (!options::show_function_signatures()) {
+          if (first_user_def)
+            first_user_def = false;
+          else
+            signatures += "\n";
+        }
+        if (options::show_function_signatures())
+          signatures += "\n";
+        signatures += it->doc;
+        if (options::show_function_signatures())
+          signatures += "\n";
+      }
+    }
 
-        /* Iterator over the list of potentially admissible overloads */
-        const function_record *overloads = (function_record *) PyCapsule_GetPointer(self, nullptr),
-                              *it = overloads;
+    /* Install docstring */
+    PyCFunctionObject *func = (PyCFunctionObject *)m_ptr;
+    if (func->m_ml->ml_doc)
+      std::free(const_cast<char *>(func->m_ml->ml_doc));
+    func->m_ml->ml_doc = strdup(signatures.c_str());
 
-        /* Need to know how many arguments + keyword arguments there are to pick the right overload */
-        const size_t n_args_in = (size_t) PyTuple_GET_SIZE(args_in);
+    if (rec->is_method) {
+      m_ptr = PYBIND11_INSTANCE_METHOD_NEW(m_ptr, rec->scope.ptr());
+      if (!m_ptr)
+        pybind11_fail("cpp_function::cpp_function(): Could not allocate "
+                      "instance method object");
+      Py_DECREF(func);
+    }
+  }
 
-        handle parent = n_args_in > 0 ? PyTuple_GET_ITEM(args_in, 0) : nullptr,
-               result = PYBIND11_TRY_NEXT_OVERLOAD;
+  /// When a cpp_function is GCed, release any memory allocated by pybind11
+  static void destruct(detail::function_record *rec) {
+    while (rec) {
+      detail::function_record *next = rec->next;
+      if (rec->free_data)
+        rec->free_data(rec);
+      std::free((char *)rec->name);
+      std::free((char *)rec->doc);
+      std::free((char *)rec->signature);
+      for (auto &arg : rec->args) {
+        std::free(const_cast<char *>(arg.name));
+        std::free(const_cast<char *>(arg.descr));
+        arg.value.dec_ref();
+      }
+      if (rec->def) {
+        std::free(const_cast<char *>(rec->def->ml_doc));
+        delete rec->def;
+      }
+      delete rec;
+      rec = next;
+    }
+  }
 
-        auto self_value_and_holder = value_and_holder();
-        if (overloads->is_constructor) {
-            const auto tinfo = get_type_info((PyTypeObject *) overloads->scope.ptr());
-            const auto pi = reinterpret_cast<instance *>(parent.ptr());
-            self_value_and_holder = pi->get_value_and_holder(tinfo, false);
+  /// Main dispatch logic for calls to functions bound using pybind11
+  static PyObject *dispatcher(PyObject *self, PyObject *args_in,
+                              PyObject *kwargs_in) {
+    using namespace detail;
 
-            if (!self_value_and_holder.type || !self_value_and_holder.inst) {
-                PyErr_SetString(PyExc_TypeError, "__init__(self, ...) called with invalid `self` argument");
-                return nullptr;
-            }
+    /* Iterator over the list of potentially admissible overloads */
+    const function_record *overloads = (function_record *)PyCapsule_GetPointer(
+                              self, nullptr),
+                          *it = overloads;
 
-            // If this value is already registered it must mean __init__ is invoked multiple times;
-            // we really can't support that in C++, so just ignore the second __init__.
-            if (self_value_and_holder.instance_registered())
-                return none().release().ptr();
+    /* Need to know how many arguments + keyword arguments there are to pick the
+     * right overload */
+    const size_t n_args_in = (size_t)PyTuple_GET_SIZE(args_in);
+
+    handle parent = n_args_in > 0 ? PyTuple_GET_ITEM(args_in, 0) : nullptr,
+           result = PYBIND11_TRY_NEXT_OVERLOAD;
+
+    auto self_value_and_holder = value_and_holder();
+    if (overloads->is_constructor) {
+      const auto tinfo = get_type_info((PyTypeObject *)overloads->scope.ptr());
+      const auto pi = reinterpret_cast<instance *>(parent.ptr());
+      self_value_and_holder = pi->get_value_and_holder(tinfo, false);
+
+      if (!self_value_and_holder.type || !self_value_and_holder.inst) {
+        PyErr_SetString(
+            PyExc_TypeError,
+            "__init__(self, ...) called with invalid `self` argument");
+        return nullptr;
+      }
+
+      // If this value is already registered it must mean __init__ is invoked
+      // multiple times; we really can't support that in C++, so just ignore the
+      // second __init__.
+      if (self_value_and_holder.instance_registered())
+        return none().release().ptr();
+    }
+
+    try {
+      // We do this in two passes: in the first pass, we load arguments with
+      // `convert=false`; in the second, we allow conversion (except for
+      // arguments with an explicit py::arg().noconvert()).  This lets us prefer
+      // calls without conversion, with conversion as a fallback.
+      std::vector<function_call> second_pass;
+
+      // However, if there are no overloads, we can just skip the no-convert
+      // pass entirely
+      const bool overloaded = it != nullptr && it->next != nullptr;
+
+      for (; it != nullptr; it = it->next) {
+
+        /* For each overload:
+           1. Copy all positional arguments we were given, also checking to make
+           sure that named positional arguments weren't *also* specified via
+           kwarg.
+           2. If we weren't given enough, try to make up the omitted ones by
+           checking whether they were provided by a kwarg matching the
+           `py::arg("name")` name.  If so, use it (removing it from kwargs); if
+           not, see if the function binding provided a default that we can use.
+           3. Ensure that either all keyword arguments were "consumed", or that
+           the function takes a kwargs argument to accept unconsumed kwargs.
+           4. Any positional arguments still left get put into a tuple (for
+           args), and any leftover kwargs get put into a dict.
+           5. Pack everything into a vector; if we have py::args or py::kwargs,
+           they are an extra tuple or dict at the end of the positional
+           arguments.
+           6. Call the function call dispatcher (function_record::impl)
+
+           If one of these fails, move on to the next overload and keep trying
+           until we get a result other than PYBIND11_TRY_NEXT_OVERLOAD.
+         */
+
+        const function_record &func = *it;
+        size_t pos_args =
+            func.nargs; // Number of positional arguments that we need
+        if (func.has_args)
+          --pos_args; // (but don't count py::args
+        if (func.has_kwargs)
+          --pos_args; //  or py::kwargs)
+
+        if (!func.has_args && n_args_in > pos_args)
+          continue; // Too many arguments for this overload
+
+        if (n_args_in < pos_args && func.args.size() < pos_args)
+          continue; // Not enough arguments given, and not enough defaults to
+                    // fill in the blanks
+
+        function_call call(func, parent);
+
+        size_t args_to_copy = std::min(pos_args, n_args_in);
+        size_t args_copied = 0;
+
+        // 0. Inject new-style `self` argument
+        if (func.is_new_style_constructor) {
+          // The `value` may have been preallocated by an old-style `__init__`
+          // if it was a preceding candidate for overload resolution.
+          if (self_value_and_holder)
+            self_value_and_holder.type->dealloc(self_value_and_holder);
+
+          call.init_self = PyTuple_GET_ITEM(args_in, 0);
+          call.args.push_back(
+              reinterpret_cast<PyObject *>(&self_value_and_holder));
+          call.args_convert.push_back(false);
+          ++args_copied;
         }
 
+        // 1. Copy any positional arguments given.
+        bool bad_arg = false;
+        for (; args_copied < args_to_copy; ++args_copied) {
+          const argument_record *arg_rec = args_copied < func.args.size()
+                                               ? &func.args[args_copied]
+                                               : nullptr;
+          if (kwargs_in && arg_rec && arg_rec->name &&
+              PyDict_GetItemString(kwargs_in, arg_rec->name)) {
+            bad_arg = true;
+            break;
+          }
+
+          handle arg(PyTuple_GET_ITEM(args_in, args_copied));
+          if (arg_rec && !arg_rec->none && arg.is_none()) {
+            bad_arg = true;
+            break;
+          }
+          call.args.push_back(arg);
+          call.args_convert.push_back(arg_rec ? arg_rec->convert : true);
+        }
+        if (bad_arg)
+          continue; // Maybe it was meant for another overload (issue #688)
+
+        // We'll need to copy this if we steal some kwargs for defaults
+        dict kwargs = reinterpret_borrow<dict>(kwargs_in);
+
+        // 2. Check kwargs and, failing that, defaults that may help complete
+        // the list
+        if (args_copied < pos_args) {
+          bool copied_kwargs = false;
+
+          for (; args_copied < pos_args; ++args_copied) {
+            const auto &arg = func.args[args_copied];
+
+            handle value;
+            if (kwargs_in && arg.name)
+              value = PyDict_GetItemString(kwargs.ptr(), arg.name);
+
+            if (value) {
+              // Consume a kwargs value
+              if (!copied_kwargs) {
+                kwargs = reinterpret_steal<dict>(PyDict_Copy(kwargs.ptr()));
+                copied_kwargs = true;
+              }
+              PyDict_DelItemString(kwargs.ptr(), arg.name);
+            } else if (arg.value) {
+              value = arg.value;
+            }
+
+            if (value) {
+              call.args.push_back(value);
+              call.args_convert.push_back(arg.convert);
+            } else
+              break;
+          }
+
+          if (args_copied < pos_args)
+            continue; // Not enough arguments, defaults, or kwargs to fill the
+                      // positional arguments
+        }
+
+        // 3. Check everything was consumed (unless we have a kwargs arg)
+        if (kwargs && kwargs.size() > 0 && !func.has_kwargs)
+          continue; // Unconsumed kwargs, but no py::kwargs argument to accept
+                    // them
+
+        // 4a. If we have a py::args argument, create a new tuple with leftovers
+        if (func.has_args) {
+          tuple extra_args;
+          if (args_to_copy == 0) {
+            // We didn't copy out any positional arguments from the args_in
+            // tuple, so we can reuse it directly without copying:
+            extra_args = reinterpret_borrow<tuple>(args_in);
+          } else if (args_copied >= n_args_in) {
+            extra_args = tuple(0);
+          } else {
+            size_t args_size = n_args_in - args_copied;
+            extra_args = tuple(args_size);
+            for (size_t i = 0; i < args_size; ++i) {
+              extra_args[i] = PyTuple_GET_ITEM(args_in, args_copied + i);
+            }
+          }
+          call.args.push_back(extra_args);
+          call.args_convert.push_back(false);
+          call.args_ref = std::move(extra_args);
+        }
+
+        // 4b. If we have a py::kwargs, pass on any remaining kwargs
+        if (func.has_kwargs) {
+          if (!kwargs.ptr())
+            kwargs = dict(); // If we didn't get one, send an empty one
+          call.args.push_back(kwargs);
+          call.args_convert.push_back(false);
+          call.kwargs_ref = std::move(kwargs);
+        }
+
+// 5. Put everything in a vector.  Not technically step 5, we've been building
+// it in `call.args` all along.
+#if !defined(NDEBUG)
+        if (call.args.size() != func.nargs ||
+            call.args_convert.size() != func.nargs)
+          pybind11_fail("Internal error: function call dispatcher inserted "
+                        "wrong number of arguments!");
+#endif
+
+        std::vector<bool> second_pass_convert;
+        if (overloaded) {
+          // We're in the first no-convert pass, so swap out the conversion
+          // flags for a set of all-false flags.  If the call fails, we'll swap
+          // the flags back in for the conversion-allowed call below.
+          second_pass_convert.resize(func.nargs, false);
+          call.args_convert.swap(second_pass_convert);
+        }
+
+        // 6. Call the function.
         try {
-            // We do this in two passes: in the first pass, we load arguments with `convert=false`;
-            // in the second, we allow conversion (except for arguments with an explicit
-            // py::arg().noconvert()).  This lets us prefer calls without conversion, with
-            // conversion as a fallback.
-            std::vector<function_call> second_pass;
+          loader_life_support guard{};
+          result = func.impl(call);
+        } catch (reference_cast_error &) {
+          result = PYBIND11_TRY_NEXT_OVERLOAD;
+        }
 
-            // However, if there are no overloads, we can just skip the no-convert pass entirely
-            const bool overloaded = it != nullptr && it->next != nullptr;
+        if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD)
+          break;
 
-            for (; it != nullptr; it = it->next) {
-
-                /* For each overload:
-                   1. Copy all positional arguments we were given, also checking to make sure that
-                      named positional arguments weren't *also* specified via kwarg.
-                   2. If we weren't given enough, try to make up the omitted ones by checking
-                      whether they were provided by a kwarg matching the `py::arg("name")` name.  If
-                      so, use it (and remove it from kwargs; if not, see if the function binding
-                      provided a default that we can use.
-                   3. Ensure that either all keyword arguments were "consumed", or that the function
-                      takes a kwargs argument to accept unconsumed kwargs.
-                   4. Any positional arguments still left get put into a tuple (for args), and any
-                      leftover kwargs get put into a dict.
-                   5. Pack everything into a vector; if we have py::args or py::kwargs, they are an
-                      extra tuple or dict at the end of the positional arguments.
-                   6. Call the function call dispatcher (function_record::impl)
-
-                   If one of these fail, move on to the next overload and keep trying until we get a
-                   result other than PYBIND11_TRY_NEXT_OVERLOAD.
-                 */
-
-                const function_record &func = *it;
-                size_t pos_args = func.nargs;    // Number of positional arguments that we need
-                if (func.has_args) --pos_args;   // (but don't count py::args
-                if (func.has_kwargs) --pos_args; //  or py::kwargs)
-
-                if (!func.has_args && n_args_in > pos_args)
-                    continue; // Too many arguments for this overload
-
-                if (n_args_in < pos_args && func.args.size() < pos_args)
-                    continue; // Not enough arguments given, and not enough defaults to fill in the blanks
-
-                function_call call(func, parent);
-
-                size_t args_to_copy = std::min(pos_args, n_args_in);
-                size_t args_copied = 0;
-
-                // 0. Inject new-style `self` argument
-                if (func.is_new_style_constructor) {
-                    // The `value` may have been preallocated by an old-style `__init__`
-                    // if it was a preceding candidate for overload resolution.
-                    if (self_value_and_holder)
-                        self_value_and_holder.type->dealloc(self_value_and_holder);
-
-                    call.init_self = PyTuple_GET_ITEM(args_in, 0);
-                    call.args.push_back(reinterpret_cast<PyObject *>(&self_value_and_holder));
-                    call.args_convert.push_back(false);
-                    ++args_copied;
-                }
-
-                // 1. Copy any position arguments given.
-                bool bad_arg = false;
-                for (; args_copied < args_to_copy; ++args_copied) {
-                    const argument_record *arg_rec = args_copied < func.args.size() ? &func.args[args_copied] : nullptr;
-                    if (kwargs_in && arg_rec && arg_rec->name && PyDict_GetItemString(kwargs_in, arg_rec->name)) {
-                        bad_arg = true;
-                        break;
-                    }
-
-                    handle arg(PyTuple_GET_ITEM(args_in, args_copied));
-                    if (arg_rec && !arg_rec->none && arg.is_none()) {
-                        bad_arg = true;
-                        break;
-                    }
-                    call.args.push_back(arg);
-                    call.args_convert.push_back(arg_rec ? arg_rec->convert : true);
-                }
-                if (bad_arg)
-                    continue; // Maybe it was meant for another overload (issue #688)
-
-                // We'll need to copy this if we steal some kwargs for defaults
-                dict kwargs = reinterpret_borrow<dict>(kwargs_in);
-
-                // 2. Check kwargs and, failing that, defaults that may help complete the list
-                if (args_copied < pos_args) {
-                    bool copied_kwargs = false;
-
-                    for (; args_copied < pos_args; ++args_copied) {
-                        const auto &arg = func.args[args_copied];
-
-                        handle value;
-                        if (kwargs_in && arg.name)
-                            value = PyDict_GetItemString(kwargs.ptr(), arg.name);
-
-                        if (value) {
-                            // Consume a kwargs value
-                            if (!copied_kwargs) {
-                                kwargs = reinterpret_steal<dict>(PyDict_Copy(kwargs.ptr()));
-                                copied_kwargs = true;
-                            }
-                            PyDict_DelItemString(kwargs.ptr(), arg.name);
-                        } else if (arg.value) {
-                            value = arg.value;
-                        }
-
-                        if (value) {
-                            call.args.push_back(value);
-                            call.args_convert.push_back(arg.convert);
-                        }
-                        else
-                            break;
-                    }
-
-                    if (args_copied < pos_args)
-                        continue; // Not enough arguments, defaults, or kwargs to fill the positional arguments
-                }
-
-                // 3. Check everything was consumed (unless we have a kwargs arg)
-                if (kwargs && kwargs.size() > 0 && !func.has_kwargs)
-                    continue; // Unconsumed kwargs, but no py::kwargs argument to accept them
-
-                // 4a. If we have a py::args argument, create a new tuple with leftovers
-                if (func.has_args) {
-                    tuple extra_args;
-                    if (args_to_copy == 0) {
-                        // We didn't copy out any position arguments from the args_in tuple, so we
-                        // can reuse it directly without copying:
-                        extra_args = reinterpret_borrow<tuple>(args_in);
-                    } else if (args_copied >= n_args_in) {
-                        extra_args = tuple(0);
-                    } else {
-                        size_t args_size = n_args_in - args_copied;
-                        extra_args = tuple(args_size);
-                        for (size_t i = 0; i < args_size; ++i) {
-                            extra_args[i] = PyTuple_GET_ITEM(args_in, args_copied + i);
-                        }
-                    }
-                    call.args.push_back(extra_args);
-                    call.args_convert.push_back(false);
-                    call.args_ref = std::move(extra_args);
-                }
-
-                // 4b. If we have a py::kwargs, pass on any remaining kwargs
-                if (func.has_kwargs) {
-                    if (!kwargs.ptr())
-                        kwargs = dict(); // If we didn't get one, send an empty one
-                    call.args.push_back(kwargs);
-                    call.args_convert.push_back(false);
-                    call.kwargs_ref = std::move(kwargs);
-                }
-
-                // 5. Put everything in a vector.  Not technically step 5, we've been building it
-                // in `call.args` all along.
-                #if !defined(NDEBUG)
-                if (call.args.size() != func.nargs || call.args_convert.size() != func.nargs)
-                    pybind11_fail("Internal error: function call dispatcher inserted wrong number of arguments!");
-                #endif
-
-                std::vector<bool> second_pass_convert;
-                if (overloaded) {
-                    // We're in the first no-convert pass, so swap out the conversion flags for a
-                    // set of all-false flags.  If the call fails, we'll swap the flags back in for
-                    // the conversion-allowed call below.
-                    second_pass_convert.resize(func.nargs, false);
-                    call.args_convert.swap(second_pass_convert);
-                }
-
-                // 6. Call the function.
-                try {
-                    loader_life_support guard{};
-                    result = func.impl(call);
-                } catch (reference_cast_error &) {
-                    result = PYBIND11_TRY_NEXT_OVERLOAD;
-                }
-
-                if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD)
-                    break;
-
-                if (overloaded) {
-                    // The (overloaded) call failed; if the call has at least one argument that
-                    // permits conversion (i.e. it hasn't been explicitly specified `.noconvert()`)
-                    // then add this call to the list of second pass overloads to try.
-                    for (size_t i = func.is_method ? 1 : 0; i < pos_args; i++) {
-                        if (second_pass_convert[i]) {
-                            // Found one: swap the converting flags back in and store the call for
-                            // the second pass.
-                            call.args_convert.swap(second_pass_convert);
-                            second_pass.push_back(std::move(call));
-                            break;
-                        }
-                    }
-                }
+        if (overloaded) {
+          // The (overloaded) call failed; if the call has at least one argument
+          // that permits conversion (i.e. it hasn't been explicitly specified
+          // `.noconvert()`) then add this call to the list of second pass
+          // overloads to try.
+          for (size_t i = func.is_method ? 1 : 0; i < pos_args; i++) {
+            if (second_pass_convert[i]) {
+              // Found one: swap the converting flags back in and store the call
+              // for the second pass.
+              call.args_convert.swap(second_pass_convert);
+              second_pass.push_back(std::move(call));
+              break;
             }
+          }
+        }
+      }
 
-            if (overloaded && !second_pass.empty() && result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) {
-                // The no-conversion pass finished without success, try again with conversion allowed
-                for (auto &call : second_pass) {
-                    try {
-                        loader_life_support guard{};
-                        result = call.func.impl(call);
-                    } catch (reference_cast_error &) {
-                        result = PYBIND11_TRY_NEXT_OVERLOAD;
-                    }
+      if (overloaded && !second_pass.empty() &&
+          result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) {
+        // The no-conversion pass finished without success, try again with
+        // conversion allowed
+        for (auto &call : second_pass) {
+          try {
+            loader_life_support guard{};
+            result = call.func.impl(call);
+          } catch (reference_cast_error &) {
+            result = PYBIND11_TRY_NEXT_OVERLOAD;
+          }
 
-                    if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) {
-                        // The error reporting logic below expects 'it' to be valid, as it would be
-                        // if we'd encountered this failure in the first-pass loop.
-                        if (!result)
-                            it = &call.func;
-                        break;
-                    }
-                }
-            }
-        } catch (error_already_set &e) {
-            e.restore();
-            return nullptr;
+          if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) {
+            // The error reporting logic below expects 'it' to be valid, as it
+            // would be if we'd encountered this failure in the first-pass loop.
+            if (!result)
+              it = &call.func;
+            break;
+          }
+        }
+      }
+    } catch (error_already_set &e) {
+      e.restore();
+      return nullptr;
 #if defined(__GNUG__) && !defined(__clang__)
-        } catch ( abi::__forced_unwind& ) {
-            throw;
+    } catch (abi::__forced_unwind &) {
+      throw;
 #endif
+    } catch (...) {
+      /* When an exception is caught, give each registered exception
+         translator a chance to translate it to a Python exception
+         in reverse order of registration.
+
+         A translator may choose to do one of the following:
+
+          - catch the exception and call PyErr_SetString or PyErr_SetObject
+            to set a standard (or custom) Python exception, or
+          - do nothing and let the exception fall through to the next
+            translator, or
+          - delegate translation to the next translator by throwing a new type
+            of exception. */
+
+      auto last_exception = std::current_exception();
+      auto &registered_exception_translators =
+          get_internals().registered_exception_translators;
+      for (auto &translator : registered_exception_translators) {
+        try {
+          translator(last_exception);
         } catch (...) {
-            /* When an exception is caught, give each registered exception
-               translator a chance to translate it to a Python exception
-               in reverse order of registration.
-
-               A translator may choose to do one of the following:
-
-                - catch the exception and call PyErr_SetString or PyErr_SetObject
-                  to set a standard (or custom) Python exception, or
-                - do nothing and let the exception fall through to the next translator, or
-                - delegate translation to the next translator by throwing a new type of exception. */
-
-            auto last_exception = std::current_exception();
-            auto &registered_exception_translators = get_internals().registered_exception_translators;
-            for (auto& translator : registered_exception_translators) {
-                try {
-                    translator(last_exception);
-                } catch (...) {
-                    last_exception = std::current_exception();
-                    continue;
-                }
-                return nullptr;
-            }
-            PyErr_SetString(PyExc_SystemError, "Exception escaped from default exception translator!");
-            return nullptr;
-        }
-
-        auto append_note_if_missing_header_is_suspected = [](std::string &msg) {
-            if (msg.find("std::") != std::string::npos) {
-                msg += "\n\n"
-                       "Did you forget to `#include <pybind11/stl.h>`? Or <pybind11/complex.h>,\n"
-                       "<pybind11/functional.h>, <pybind11/chrono.h>, etc. Some automatic\n"
-                       "conversions are optional and require extra headers to be included\n"
-                       "when compiling your pybind11 module.";
-            }
-        };
-
-        if (result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) {
-            if (overloads->is_operator)
-                return handle(Py_NotImplemented).inc_ref().ptr();
-
-            std::string msg = std::string(overloads->name) + "(): incompatible " +
-                std::string(overloads->is_constructor ? "constructor" : "function") +
-                " arguments. The following argument types are supported:\n";
-
-            int ctr = 0;
-            for (const function_record *it2 = overloads; it2 != nullptr; it2 = it2->next) {
-                msg += "    "+ std::to_string(++ctr) + ". ";
-
-                bool wrote_sig = false;
-                if (overloads->is_constructor) {
-                    // For a constructor, rewrite `(self: Object, arg0, ...) -> NoneType` as `Object(arg0, ...)`
-                    std::string sig = it2->signature;
-                    size_t start = sig.find('(') + 7; // skip "(self: "
-                    if (start < sig.size()) {
-                        // End at the , for the next argument
-                        size_t end = sig.find(", "), next = end + 2;
-                        size_t ret = sig.rfind(" -> ");
-                        // Or the ), if there is no comma:
-                        if (end >= sig.size()) next = end = sig.find(')');
-                        if (start < end && next < sig.size()) {
-                            msg.append(sig, start, end - start);
-                            msg += '(';
-                            msg.append(sig, next, ret - next);
-                            wrote_sig = true;
-                        }
-                    }
-                }
-                if (!wrote_sig) msg += it2->signature;
-
-                msg += "\n";
-            }
-            msg += "\nInvoked with: ";
-            auto args_ = reinterpret_borrow<tuple>(args_in);
-            bool some_args = false;
-            for (size_t ti = overloads->is_constructor ? 1 : 0; ti < args_.size(); ++ti) {
-                if (!some_args) some_args = true;
-                else msg += ", ";
-                msg += pybind11::repr(args_[ti]);
-            }
-            if (kwargs_in) {
-                auto kwargs = reinterpret_borrow<dict>(kwargs_in);
-                if (kwargs.size() > 0) {
-                    if (some_args) msg += "; ";
-                    msg += "kwargs: ";
-                    bool first = true;
-                    for (auto kwarg : kwargs) {
-                        if (first) first = false;
-                        else msg += ", ";
-                        msg += pybind11::str("{}={!r}").format(kwarg.first, kwarg.second);
-                    }
-                }
-            }
-
-            append_note_if_missing_header_is_suspected(msg);
-            PyErr_SetString(PyExc_TypeError, msg.c_str());
-            return nullptr;
-        } else if (!result) {
-            std::string msg = "Unable to convert function return value to a "
-                              "Python type! The signature was\n\t";
-            msg += it->signature;
-            append_note_if_missing_header_is_suspected(msg);
-            PyErr_SetString(PyExc_TypeError, msg.c_str());
-            return nullptr;
-        } else {
-            if (overloads->is_constructor && !self_value_and_holder.holder_constructed()) {
-                auto *pi = reinterpret_cast<instance *>(parent.ptr());
-                self_value_and_holder.type->init_instance(pi, nullptr);
-            }
-            return result.ptr();
+          last_exception = std::current_exception();
+          continue;
         }
+        return nullptr;
+      }
+      PyErr_SetString(PyExc_SystemError,
+                      "Exception escaped from default exception translator!");
+      return nullptr;
     }
+
+    auto append_note_if_missing_header_is_suspected = [](std::string &msg) {
+      if (msg.find("std::") != std::string::npos) {
+        msg += "\n\n"
+               "Did you forget to `#include <pybind11/stl.h>`? Or "
+               "<pybind11/complex.h>,\n"
+               "<pybind11/functional.h>, <pybind11/chrono.h>, etc. Some "
+               "automatic\n"
+               "conversions are optional and require extra headers to be "
+               "included\n"
+               "when compiling your pybind11 module.";
+      }
+    };
+
+    if (result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) {
+      if (overloads->is_operator)
+        return handle(Py_NotImplemented).inc_ref().ptr();
+
+      std::string msg =
+          std::string(overloads->name) + "(): incompatible " +
+          std::string(overloads->is_constructor ? "constructor" : "function") +
+          " arguments. The following argument types are supported:\n";
+
+      int ctr = 0;
+      for (const function_record *it2 = overloads; it2 != nullptr;
+           it2 = it2->next) {
+        msg += "    " + std::to_string(++ctr) + ". ";
+
+        bool wrote_sig = false;
+        if (overloads->is_constructor) {
+          // For a constructor, rewrite `(self: Object, arg0, ...) -> NoneType`
+          // as `Object(arg0, ...)`
+          std::string sig = it2->signature;
+          size_t start = sig.find('(') + 7; // skip "(self: "
+          if (start < sig.size()) {
+            // End at the , for the next argument
+            size_t end = sig.find(", "), next = end + 2;
+            size_t ret = sig.rfind(" -> ");
+            // Or the ), if there is no comma:
+            if (end >= sig.size())
+              next = end = sig.find(')');
+            if (start < end && next < sig.size()) {
+              msg.append(sig, start, end - start);
+              msg += '(';
+              msg.append(sig, next, ret - next);
+              wrote_sig = true;
+            }
+          }
+        }
+        if (!wrote_sig)
+          msg += it2->signature;
+
+        msg += "\n";
+      }
+      msg += "\nInvoked with: ";
+      auto args_ = reinterpret_borrow<tuple>(args_in);
+      bool some_args = false;
+      for (size_t ti = overloads->is_constructor ? 1 : 0; ti < args_.size();
+           ++ti) {
+        if (!some_args)
+          some_args = true;
+        else
+          msg += ", ";
+        msg += pybind11::repr(args_[ti]);
+      }
+      if (kwargs_in) {
+        auto kwargs = reinterpret_borrow<dict>(kwargs_in);
+        if (kwargs.size() > 0) {
+          if (some_args)
+            msg += "; ";
+          msg += "kwargs: ";
+          bool first = true;
+          for (auto kwarg : kwargs) {
+            if (first)
+              first = false;
+            else
+              msg += ", ";
+            msg += pybind11::str("{}={!r}").format(kwarg.first, kwarg.second);
+          }
+        }
+      }
+
+      append_note_if_missing_header_is_suspected(msg);
+      PyErr_SetString(PyExc_TypeError, msg.c_str());
+      return nullptr;
+    } else if (!result) {
+      std::string msg = "Unable to convert function return value to a "
+                        "Python type! The signature was\n\t";
+      msg += it->signature;
+      append_note_if_missing_header_is_suspected(msg);
+      PyErr_SetString(PyExc_TypeError, msg.c_str());
+      return nullptr;
+    } else {
+      if (overloads->is_constructor &&
+          !self_value_and_holder.holder_constructed()) {
+        auto *pi = reinterpret_cast<instance *>(parent.ptr());
+        self_value_and_holder.type->init_instance(pi, nullptr);
+      }
+      return result.ptr();
+    }
+  }
 };
 
 /// Wrapper for Python extension modules
 class module : public object {
 public:
-    PYBIND11_OBJECT_DEFAULT(module, object, PyModule_Check)
+  PYBIND11_OBJECT_DEFAULT(module, object, PyModule_Check)
 
-    /// Create a new top-level Python module with the given name and docstring
-    explicit module(const char *name, const char *doc = nullptr) {
-        if (!options::show_user_defined_docstrings()) doc = nullptr;
+  /// Create a new top-level Python module with the given name and docstring
+  explicit module(const char *name, const char *doc = nullptr) {
+    if (!options::show_user_defined_docstrings())
+      doc = nullptr;
 #if PY_MAJOR_VERSION >= 3
-        PyModuleDef *def = new PyModuleDef();
-        std::memset(def, 0, sizeof(PyModuleDef));
-        def->m_name = name;
-        def->m_doc = doc;
-        def->m_size = -1;
-        Py_INCREF(def);
-        m_ptr = PyModule_Create(def);
+    PyModuleDef *def = new PyModuleDef();
+    std::memset(def, 0, sizeof(PyModuleDef));
+    def->m_name = name;
+    def->m_doc = doc;
+    def->m_size = -1;
+    Py_INCREF(def);
+    m_ptr = PyModule_Create(def);
 #else
-        m_ptr = Py_InitModule3(name, nullptr, doc);
+    m_ptr = Py_InitModule3(name, nullptr, doc);
 #endif
-        if (m_ptr == nullptr)
-            pybind11_fail("Internal error in module::module()");
-        inc_ref();
-    }
+    if (m_ptr == nullptr)
+      pybind11_fail("Internal error in module::module()");
+    inc_ref();
+  }
 
-    /** \rst
-        Create Python binding for a new function within the module scope. ``Func``
-        can be a plain C++ function, a function pointer, or a lambda function. For
-        details on the ``Extra&& ... extra`` argument, see section :ref:`extras`.
-    \endrst */
-    template <typename Func, typename... Extra>
-    module &def(const char *name_, Func &&f, const Extra& ... extra) {
-        cpp_function func(std::forward<Func>(f), name(name_), scope(*this),
-                          sibling(getattr(*this, name_, none())), extra...);
-        // NB: allow overwriting here because cpp_function sets up a chain with the intention of
-        // overwriting (and has already checked internally that it isn't overwriting non-functions).
-        add_object(name_, func, true /* overwrite */);
-        return *this;
-    }
+  /** \rst
+      Create Python binding for a new function within the module scope. ``Func``
+      can be a plain C++ function, a function pointer, or a lambda function. For
+      details on the ``Extra&& ... extra`` argument, see section :ref:`extras`.
+  \endrst */
+  template <typename Func, typename... Extra>
+  module &def(const char *name_, Func &&f, const Extra &... extra) {
+    cpp_function func(std::forward<Func>(f), name(name_), scope(*this),
+                      sibling(getattr(*this, name_, none())), extra...);
+    // NB: allow overwriting here because cpp_function sets up a chain with the
+    // intention of overwriting (and has already checked internally that it
+    // isn't overwriting non-functions).
+    add_object(name_, func, true /* overwrite */);
+    return *this;
+  }
 
-    /** \rst
-        Create and return a new Python submodule with the given name and docstring.
-        This also works recursively, i.e.
+  /** \rst
+      Create and return a new Python submodule with the given name and
+  docstring. This also works recursively, i.e.
 
-        .. code-block:: cpp
+      .. code-block:: cpp
 
-            py::module m("example", "pybind11 example plugin");
-            py::module m2 = m.def_submodule("sub", "A submodule of 'example'");
-            py::module m3 = m2.def_submodule("subsub", "A submodule of 'example.sub'");
-    \endrst */
-    module def_submodule(const char *name, const char *doc = nullptr) {
-        std::string full_name = std::string(PyModule_GetName(m_ptr))
-            + std::string(".") + std::string(name);
-        auto result = reinterpret_borrow<module>(PyImport_AddModule(full_name.c_str()));
-        if (doc && options::show_user_defined_docstrings())
-            result.attr("__doc__") = pybind11::str(doc);
-        attr(name) = result;
-        return result;
-    }
+          py::module m("example", "pybind11 example plugin");
+          py::module m2 = m.def_submodule("sub", "A submodule of 'example'");
+          py::module m3 = m2.def_submodule("subsub", "A submodule of
+  'example.sub'"); \endrst */
+  module def_submodule(const char *name, const char *doc = nullptr) {
+    std::string full_name = std::string(PyModule_GetName(m_ptr)) +
+                            std::string(".") + std::string(name);
+    auto result =
+        reinterpret_borrow<module>(PyImport_AddModule(full_name.c_str()));
+    if (doc && options::show_user_defined_docstrings())
+      result.attr("__doc__") = pybind11::str(doc);
+    attr(name) = result;
+    return result;
+  }
 
-    /// Import and return a module or throws `error_already_set`.
-    static module import(const char *name) {
-        PyObject *obj = PyImport_ImportModule(name);
-        if (!obj)
-            throw error_already_set();
-        return reinterpret_steal<module>(obj);
-    }
+  /// Import and return a module or throws `error_already_set`.
+  static module import(const char *name) {
+    PyObject *obj = PyImport_ImportModule(name);
+    if (!obj)
+      throw error_already_set();
+    return reinterpret_steal<module>(obj);
+  }
 
-    /// Reload the module or throws `error_already_set`.
-    void reload() {
-        PyObject *obj = PyImport_ReloadModule(ptr());
-        if (!obj)
-            throw error_already_set();
-        *this = reinterpret_steal<module>(obj);
-    }
+  /// Reload the module or throws `error_already_set`.
+  void reload() {
+    PyObject *obj = PyImport_ReloadModule(ptr());
+    if (!obj)
+      throw error_already_set();
+    *this = reinterpret_steal<module>(obj);
+  }
 
-    // Adds an object to the module using the given name.  Throws if an object with the given name
-    // already exists.
-    //
-    // overwrite should almost always be false: attempting to overwrite objects that pybind11 has
-    // established will, in most cases, break things.
-    PYBIND11_NOINLINE void add_object(const char *name, handle obj, bool overwrite = false) {
-        if (!overwrite && hasattr(*this, name))
-            pybind11_fail("Error during initialization: multiple incompatible definitions with name \"" +
+  // Adds an object to the module using the given name.  Throws if an object
+  // with the given name already exists.
+  //
+  // overwrite should almost always be false: attempting to overwrite objects
+  // that pybind11 has established will, in most cases, break things.
+  PYBIND11_NOINLINE void add_object(const char *name, handle obj,
+                                    bool overwrite = false) {
+    if (!overwrite && hasattr(*this, name))
+      pybind11_fail("Error during initialization: multiple incompatible "
+                    "definitions with name \"" +
                     std::string(name) + "\"");
 
-        PyModule_AddObject(ptr(), name, obj.inc_ref().ptr() /* steals a reference */);
-    }
+    PyModule_AddObject(ptr(), name,
+                       obj.inc_ref().ptr() /* steals a reference */);
+  }
 };
 
 /// \ingroup python_builtins
-/// Return a dictionary representing the global variables in the current execution frame,
-/// or ``__main__.__dict__`` if there is no frame (usually when the interpreter is embedded).
+/// Return a dictionary representing the global variables in the current
+/// execution frame, or ``__main__.__dict__`` if there is no frame (usually when
+/// the interpreter is embedded).
 inline dict globals() {
-    PyObject *p = PyEval_GetGlobals();
-    return reinterpret_borrow<dict>(p ? p : module::import("__main__").attr("__dict__").ptr());
+  PyObject *p = PyEval_GetGlobals();
+  return reinterpret_borrow<dict>(
+      p ? p : module::import("__main__").attr("__dict__").ptr());
 }
 
 NAMESPACE_BEGIN(detail)
 /// Generic support for creating new Python heap types
 class generic_type : public object {
-    template <typename...> friend class class_;
+  template <typename...> friend class class_;
+
 public:
-    PYBIND11_OBJECT_DEFAULT(generic_type, object, PyType_Check)
+  PYBIND11_OBJECT_DEFAULT(generic_type, object, PyType_Check)
 protected:
-    void initialize(const type_record &rec) {
-        if (rec.scope && hasattr(rec.scope, rec.name))
-            pybind11_fail("generic_type: cannot initialize type \"" + std::string(rec.name) +
-                          "\": an object with that name is already defined");
+  void initialize(const type_record &rec) {
+    if (rec.scope && hasattr(rec.scope, rec.name))
+      pybind11_fail("generic_type: cannot initialize type \"" +
+                    std::string(rec.name) +
+                    "\": an object with that name is already defined");
 
-        if (rec.module_local ? get_local_type_info(*rec.type) : get_global_type_info(*rec.type))
-            pybind11_fail("generic_type: type \"" + std::string(rec.name) +
-                          "\" is already registered!");
+    if (rec.module_local ? get_local_type_info(*rec.type)
+                         : get_global_type_info(*rec.type))
+      pybind11_fail("generic_type: type \"" + std::string(rec.name) +
+                    "\" is already registered!");
 
-        m_ptr = make_new_python_type(rec);
+    m_ptr = make_new_python_type(rec);
 
-        /* Register supplemental type information in C++ dict */
-        auto *tinfo = new detail::type_info();
-        tinfo->type = (PyTypeObject *) m_ptr;
-        tinfo->cpptype = rec.type;
-        tinfo->type_size = rec.type_size;
-        tinfo->type_align = rec.type_align;
-        tinfo->operator_new = rec.operator_new;
-        tinfo->holder_size_in_ptrs = size_in_ptrs(rec.holder_size);
-        tinfo->init_instance = rec.init_instance;
-        tinfo->dealloc = rec.dealloc;
-        tinfo->simple_type = true;
-        tinfo->simple_ancestors = true;
-        tinfo->default_holder = rec.default_holder;
-        tinfo->module_local = rec.module_local;
+    /* Register supplemental type information in C++ dict */
+    auto *tinfo = new detail::type_info();
+    tinfo->type = (PyTypeObject *)m_ptr;
+    tinfo->cpptype = rec.type;
+    tinfo->type_size = rec.type_size;
+    tinfo->type_align = rec.type_align;
+    tinfo->operator_new = rec.operator_new;
+    tinfo->holder_size_in_ptrs = size_in_ptrs(rec.holder_size);
+    tinfo->init_instance = rec.init_instance;
+    tinfo->dealloc = rec.dealloc;
+    tinfo->simple_type = true;
+    tinfo->simple_ancestors = true;
+    tinfo->default_holder = rec.default_holder;
+    tinfo->module_local = rec.module_local;
 
-        auto &internals = get_internals();
-        auto tindex = std::type_index(*rec.type);
-        tinfo->direct_conversions = &internals.direct_conversions[tindex];
-        if (rec.module_local)
-            registered_local_types_cpp()[tindex] = tinfo;
-        else
-            internals.registered_types_cpp[tindex] = tinfo;
-        internals.registered_types_py[(PyTypeObject *) m_ptr] = { tinfo };
+    auto &internals = get_internals();
+    auto tindex = std::type_index(*rec.type);
+    tinfo->direct_conversions = &internals.direct_conversions[tindex];
+    if (rec.module_local)
+      registered_local_types_cpp()[tindex] = tinfo;
+    else
+      internals.registered_types_cpp[tindex] = tinfo;
+    internals.registered_types_py[(PyTypeObject *)m_ptr] = {tinfo};
 
-        if (rec.bases.size() > 1 || rec.multiple_inheritance) {
-            mark_parents_nonsimple(tinfo->type);
-            tinfo->simple_ancestors = false;
-        }
-        else if (rec.bases.size() == 1) {
-            auto parent_tinfo = get_type_info((PyTypeObject *) rec.bases[0].ptr());
-            tinfo->simple_ancestors = parent_tinfo->simple_ancestors;
-        }
-
-        if (rec.module_local) {
-            // Stash the local typeinfo and loader so that external modules can access it.
-            tinfo->module_local_load = &type_caster_generic::local_load;
-            setattr(m_ptr, PYBIND11_MODULE_LOCAL_ID, capsule(tinfo));
-        }
+    if (rec.bases.size() > 1 || rec.multiple_inheritance) {
+      mark_parents_nonsimple(tinfo->type);
+      tinfo->simple_ancestors = false;
+    } else if (rec.bases.size() == 1) {
+      auto parent_tinfo = get_type_info((PyTypeObject *)rec.bases[0].ptr());
+      tinfo->simple_ancestors = parent_tinfo->simple_ancestors;
     }
 
-    /// Helper function which tags all parents of a type using mult. inheritance
-    void mark_parents_nonsimple(PyTypeObject *value) {
-        auto t = reinterpret_borrow<tuple>(value->tp_bases);
-        for (handle h : t) {
-            auto tinfo2 = get_type_info((PyTypeObject *) h.ptr());
-            if (tinfo2)
-                tinfo2->simple_type = false;
-            mark_parents_nonsimple((PyTypeObject *) h.ptr());
-        }
+    if (rec.module_local) {
+      // Stash the local typeinfo and loader so that external modules can access
+      // it.
+      tinfo->module_local_load = &type_caster_generic::local_load;
+      setattr(m_ptr, PYBIND11_MODULE_LOCAL_ID, capsule(tinfo));
     }
+  }
 
-    void install_buffer_funcs(
-            buffer_info *(*get_buffer)(PyObject *, void *),
-            void *get_buffer_data) {
-        PyHeapTypeObject *type = (PyHeapTypeObject*) m_ptr;
-        auto tinfo = detail::get_type_info(&type->ht_type);
-
-        if (!type->ht_type.tp_as_buffer)
-            pybind11_fail(
-                "To be able to register buffer protocol support for the type '" +
-                std::string(tinfo->type->tp_name) +
-                "' the associated class<>(..) invocation must "
-                "include the pybind11::buffer_protocol() annotation!");
-
-        tinfo->get_buffer = get_buffer;
-        tinfo->get_buffer_data = get_buffer_data;
+  /// Helper function which tags all parents of a type using mult. inheritance
+  void mark_parents_nonsimple(PyTypeObject *value) {
+    auto t = reinterpret_borrow<tuple>(value->tp_bases);
+    for (handle h : t) {
+      auto tinfo2 = get_type_info((PyTypeObject *)h.ptr());
+      if (tinfo2)
+        tinfo2->simple_type = false;
+      mark_parents_nonsimple((PyTypeObject *)h.ptr());
     }
+  }
 
-    // rec_func must be set for either fget or fset.
-    void def_property_static_impl(const char *name,
-                                  handle fget, handle fset,
-                                  detail::function_record *rec_func) {
-        const auto is_static = rec_func && !(rec_func->is_method && rec_func->scope);
-        const auto has_doc = rec_func && rec_func->doc && pybind11::options::show_user_defined_docstrings();
-        auto property = handle((PyObject *) (is_static ? get_internals().static_property_type
-                                                       : &PyProperty_Type));
-        attr(name) = property(fget.ptr() ? fget : none(),
-                              fset.ptr() ? fset : none(),
-                              /*deleter*/none(),
-                              pybind11::str(has_doc ? rec_func->doc : ""));
-    }
+  void install_buffer_funcs(buffer_info *(*get_buffer)(PyObject *, void *),
+                            void *get_buffer_data) {
+    PyHeapTypeObject *type = (PyHeapTypeObject *)m_ptr;
+    auto tinfo = detail::get_type_info(&type->ht_type);
+
+    if (!type->ht_type.tp_as_buffer)
+      pybind11_fail(
+          "To be able to register buffer protocol support for the type '" +
+          std::string(tinfo->type->tp_name) +
+          "' the associated class<>(..) invocation must "
+          "include the pybind11::buffer_protocol() annotation!");
+
+    tinfo->get_buffer = get_buffer;
+    tinfo->get_buffer_data = get_buffer_data;
+  }
+
+  // rec_func must be set for either fget or fset.
+  void def_property_static_impl(const char *name, handle fget, handle fset,
+                                detail::function_record *rec_func) {
+    const auto is_static =
+        rec_func && !(rec_func->is_method && rec_func->scope);
+    const auto has_doc = rec_func && rec_func->doc &&
+                         pybind11::options::show_user_defined_docstrings();
+    auto property =
+        handle((PyObject *)(is_static ? get_internals().static_property_type
+                                      : &PyProperty_Type));
+    attr(name) = property(
+        fget.ptr() ? fget : none(), fset.ptr() ? fset : none(),
+        /*deleter*/ none(), pybind11::str(has_doc ? rec_func->doc : ""));
+  }
 };
 
-/// Set the pointer to operator new if it exists. The cast is needed because it can be overloaded.
-template <typename T, typename = void_t<decltype(static_cast<void *(*)(size_t)>(T::operator new))>>
-void set_operator_new(type_record *r) { r->operator_new = &T::operator new; }
+/// Set the pointer to operator new if it exists. The cast is needed because it
+/// can be overloaded.
+template <typename T, typename = void_t<decltype(
+                          static_cast<void *(*)(size_t)>(T::operator new))>>
+void set_operator_new(type_record *r) {
+  r->operator_new = &T::operator new;
+}
 
-template <typename> void set_operator_new(...) { }
+template <typename> void set_operator_new(...) {}
 
-template <typename T, typename SFINAE = void> struct has_operator_delete : std::false_type { };
-template <typename T> struct has_operator_delete<T, void_t<decltype(static_cast<void (*)(void *)>(T::operator delete))>>
-    : std::true_type { };
-template <typename T, typename SFINAE = void> struct has_operator_delete_size : std::false_type { };
-template <typename T> struct has_operator_delete_size<T, void_t<decltype(static_cast<void (*)(void *, size_t)>(T::operator delete))>>
-    : std::true_type { };
-/// Call class-specific delete if it exists or global otherwise. Can also be an overload set.
+template <typename T, typename SFINAE = void>
+struct has_operator_delete : std::false_type {};
+template <typename T>
+struct has_operator_delete<
+    T, void_t<decltype(static_cast<void (*)(void *)>(T::operator delete))>>
+    : std::true_type {};
+template <typename T, typename SFINAE = void>
+struct has_operator_delete_size : std::false_type {};
+template <typename T>
+struct has_operator_delete_size<
+    T,
+    void_t<decltype(static_cast<void (*)(void *, size_t)>(T::operator delete))>>
+    : std::true_type {};
+/// Call class-specific delete if it exists or global otherwise. Can also be an
+/// overload set.
 template <typename T, enable_if_t<has_operator_delete<T>::value, int> = 0>
-void call_operator_delete(T *p, size_t, size_t) { T::operator delete(p); }
-template <typename T, enable_if_t<!has_operator_delete<T>::value && has_operator_delete_size<T>::value, int> = 0>
-void call_operator_delete(T *p, size_t s, size_t) { T::operator delete(p, s); }
+void call_operator_delete(T *p, size_t, size_t) {
+  T::operator delete(p);
+}
+template <typename T, enable_if_t<!has_operator_delete<T>::value &&
+                                      has_operator_delete_size<T>::value,
+                                  int> = 0>
+void call_operator_delete(T *p, size_t s, size_t) {
+  T::operator delete(p, s);
+}
 
 inline void call_operator_delete(void *p, size_t s, size_t a) {
-    (void)s; (void)a;
+  (void)s;
+  (void)a;
 #if defined(PYBIND11_CPP17)
-    if (a > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
-        ::operator delete(p, s, std::align_val_t(a));
-    else
-        ::operator delete(p, s);
+  if (a > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
+    ::operator delete(p, s, std::align_val_t(a));
+  else
+    ::operator delete(p, s);
 #else
-    ::operator delete(p);
+  ::operator delete(p);
 #endif
 }
 
@@ -1019,517 +1156,617 @@ NAMESPACE_END(detail)
 /// Given a pointer to a member function, cast it to its `Derived` version.
 /// Forward everything else unchanged.
 template <typename /*Derived*/, typename F>
-auto method_adaptor(F &&f) -> decltype(std::forward<F>(f)) { return std::forward<F>(f); }
-
-template <typename Derived, typename Return, typename Class, typename... Args>
-auto method_adaptor(Return (Class::*pmf)(Args...)) -> Return (Derived::*)(Args...) {
-    static_assert(detail::is_accessible_base_of<Class, Derived>::value,
-        "Cannot bind an inaccessible base class method; use a lambda definition instead");
-    return pmf;
+auto method_adaptor(F &&f) -> decltype(std::forward<F>(f)) {
+  return std::forward<F>(f);
 }
 
 template <typename Derived, typename Return, typename Class, typename... Args>
-auto method_adaptor(Return (Class::*pmf)(Args...) const) -> Return (Derived::*)(Args...) const {
-    static_assert(detail::is_accessible_base_of<Class, Derived>::value,
-        "Cannot bind an inaccessible base class method; use a lambda definition instead");
-    return pmf;
+auto method_adaptor(Return (Class::*pmf)(Args...))
+    -> Return (Derived::*)(Args...) {
+  static_assert(detail::is_accessible_base_of<Class, Derived>::value,
+                "Cannot bind an inaccessible base class method; use a lambda "
+                "definition instead");
+  return pmf;
+}
+
+template <typename Derived, typename Return, typename Class, typename... Args>
+auto method_adaptor(Return (Class::*pmf)(Args...) const)
+    -> Return (Derived::*)(Args...) const {
+  static_assert(detail::is_accessible_base_of<Class, Derived>::value,
+                "Cannot bind an inaccessible base class method; use a lambda "
+                "definition instead");
+  return pmf;
 }
 
 template <typename type_, typename... options>
 class class_ : public detail::generic_type {
-    template <typename T> using is_holder = detail::is_holder_type<type_, T>;
-    template <typename T> using is_subtype = detail::is_strict_base_of<type_, T>;
-    template <typename T> using is_base = detail::is_strict_base_of<T, type_>;
-    // struct instead of using here to help MSVC:
-    template <typename T> struct is_valid_class_option :
-        detail::any_of<is_holder<T>, is_subtype<T>, is_base<T>> {};
+  template <typename T> using is_holder = detail::is_holder_type<type_, T>;
+  template <typename T> using is_subtype = detail::is_strict_base_of<type_, T>;
+  template <typename T> using is_base = detail::is_strict_base_of<T, type_>;
+  // struct instead of using here to help MSVC:
+  template <typename T>
+  struct is_valid_class_option
+      : detail::any_of<is_holder<T>, is_subtype<T>, is_base<T>> {};
 
 public:
-    using type = type_;
-    using type_alias = detail::exactly_one_t<is_subtype, void, options...>;
-    constexpr static bool has_alias = !std::is_void<type_alias>::value;
-    using holder_type = detail::exactly_one_t<is_holder, std::unique_ptr<type>, options...>;
+  using type = type_;
+  using type_alias = detail::exactly_one_t<is_subtype, void, options...>;
+  constexpr static bool has_alias = !std::is_void<type_alias>::value;
+  using holder_type =
+      detail::exactly_one_t<is_holder, std::unique_ptr<type>, options...>;
 
-    static_assert(detail::all_of<is_valid_class_option<options>...>::value,
-            "Unknown/invalid class_ template parameters provided");
+  static_assert(detail::all_of<is_valid_class_option<options>...>::value,
+                "Unknown/invalid class_ template parameters provided");
 
-    static_assert(!has_alias || std::is_polymorphic<type>::value,
-            "Cannot use an alias class with a non-polymorphic type");
+  static_assert(!has_alias || std::is_polymorphic<type>::value,
+                "Cannot use an alias class with a non-polymorphic type");
 
-    PYBIND11_OBJECT(class_, generic_type, PyType_Check)
+  PYBIND11_OBJECT(class_, generic_type, PyType_Check)
 
-    template <typename... Extra>
-    class_(handle scope, const char *name, const Extra &... extra) {
-        using namespace detail;
+  template <typename... Extra>
+  class_(handle scope, const char *name, const Extra &... extra) {
+    using namespace detail;
 
-        // MI can only be specified via class_ template options, not constructor parameters
-        static_assert(
-            none_of<is_pyobject<Extra>...>::value || // no base class arguments, or:
-            (   constexpr_sum(is_pyobject<Extra>::value...) == 1 && // Exactly one base
-                constexpr_sum(is_base<options>::value...)   == 0 && // no template option bases
-                none_of<std::is_same<multiple_inheritance, Extra>...>::value), // no multiple_inheritance attr
-            "Error: multiple inheritance bases must be specified via class_ template options");
+    // MI can only be specified via class_ template options, not constructor
+    // parameters
+    static_assert(
+        none_of<is_pyobject<Extra>...>::value || // no base class arguments, or:
+            (constexpr_sum(is_pyobject<Extra>::value...) ==
+                 1 && // Exactly one base
+             constexpr_sum(is_base<options>::value...) ==
+                 0 && // no template option bases
+             none_of<std::is_same<multiple_inheritance, Extra>...>::
+                 value), // no multiple_inheritance attr
+        "Error: multiple inheritance bases must be specified via class_ "
+        "template options");
 
-        type_record record;
-        record.scope = scope;
-        record.name = name;
-        record.type = &typeid(type);
-        record.type_size = sizeof(conditional_t<has_alias, type_alias, type>);
-        record.type_align = alignof(conditional_t<has_alias, type_alias, type>&);
-        record.holder_size = sizeof(holder_type);
-        record.init_instance = init_instance;
-        record.dealloc = dealloc;
-        record.default_holder = detail::is_instantiation<std::unique_ptr, holder_type>::value;
+    type_record record;
+    record.scope = scope;
+    record.name = name;
+    record.type = &typeid(type);
+    record.type_size = sizeof(conditional_t<has_alias, type_alias, type>);
+    record.type_align = alignof(conditional_t<has_alias, type_alias, type> &);
+    record.holder_size = sizeof(holder_type);
+    record.init_instance = init_instance;
+    record.dealloc = dealloc;
+    record.default_holder =
+        detail::is_instantiation<std::unique_ptr, holder_type>::value;
 
-        set_operator_new<type>(&record);
+    set_operator_new<type>(&record);
 
-        /* Register base classes specified via template arguments to class_, if any */
-        PYBIND11_EXPAND_SIDE_EFFECTS(add_base<options>(record));
+    /* Register base classes specified via template arguments to class_, if any
+     */
+    PYBIND11_EXPAND_SIDE_EFFECTS(add_base<options>(record));
 
-        /* Process optional arguments, if any */
-        process_attributes<Extra...>::init(extra..., &record);
+    /* Process optional arguments, if any */
+    process_attributes<Extra...>::init(extra..., &record);
 
-        generic_type::initialize(record);
+    generic_type::initialize(record);
 
-        if (has_alias) {
-            auto &instances = record.module_local ? registered_local_types_cpp() : get_internals().registered_types_cpp;
-            instances[std::type_index(typeid(type_alias))] = instances[std::type_index(typeid(type))];
-        }
+    if (has_alias) {
+      auto &instances = record.module_local
+                            ? registered_local_types_cpp()
+                            : get_internals().registered_types_cpp;
+      instances[std::type_index(typeid(type_alias))] =
+          instances[std::type_index(typeid(type))];
     }
+  }
 
-    template <typename Base, detail::enable_if_t<is_base<Base>::value, int> = 0>
-    static void add_base(detail::type_record &rec) {
-        rec.add_base(typeid(Base), [](void *src) -> void * {
-            return static_cast<Base *>(reinterpret_cast<type *>(src));
-        });
-    }
+  template <typename Base, detail::enable_if_t<is_base<Base>::value, int> = 0>
+  static void add_base(detail::type_record &rec) {
+    rec.add_base(typeid(Base), [](void *src) -> void * {
+      return static_cast<Base *>(reinterpret_cast<type *>(src));
+    });
+  }
 
-    template <typename Base, detail::enable_if_t<!is_base<Base>::value, int> = 0>
-    static void add_base(detail::type_record &) { }
+  template <typename Base, detail::enable_if_t<!is_base<Base>::value, int> = 0>
+  static void add_base(detail::type_record &) {}
 
-    template <typename Func, typename... Extra>
-    class_ &def(const char *name_, Func&& f, const Extra&... extra) {
-        cpp_function cf(method_adaptor<type>(std::forward<Func>(f)), name(name_), is_method(*this),
-                        sibling(getattr(*this, name_, none())), extra...);
-        attr(cf.name()) = cf;
-        return *this;
-    }
+  template <typename Func, typename... Extra>
+  class_ &def(const char *name_, Func &&f, const Extra &... extra) {
+    cpp_function cf(method_adaptor<type>(std::forward<Func>(f)), name(name_),
+                    is_method(*this), sibling(getattr(*this, name_, none())),
+                    extra...);
+    attr(cf.name()) = cf;
+    return *this;
+  }
 
-    template <typename Func, typename... Extra> class_ &
-    def_static(const char *name_, Func &&f, const Extra&... extra) {
-        static_assert(!std::is_member_function_pointer<Func>::value,
-                "def_static(...) called with a non-static member function pointer");
-        cpp_function cf(std::forward<Func>(f), name(name_), scope(*this),
-                        sibling(getattr(*this, name_, none())), extra...);
-        attr(cf.name()) = staticmethod(cf);
-        return *this;
-    }
+  template <typename Func, typename... Extra>
+  class_ &def_static(const char *name_, Func &&f, const Extra &... extra) {
+    static_assert(
+        !std::is_member_function_pointer<Func>::value,
+        "def_static(...) called with a non-static member function pointer");
+    cpp_function cf(std::forward<Func>(f), name(name_), scope(*this),
+                    sibling(getattr(*this, name_, none())), extra...);
+    attr(cf.name()) = staticmethod(cf);
+    return *this;
+  }
 
-    template <detail::op_id id, detail::op_type ot, typename L, typename R, typename... Extra>
-    class_ &def(const detail::op_<id, ot, L, R> &op, const Extra&... extra) {
-        op.execute(*this, extra...);
-        return *this;
-    }
+  template <detail::op_id id, detail::op_type ot, typename L, typename R,
+            typename... Extra>
+  class_ &def(const detail::op_<id, ot, L, R> &op, const Extra &... extra) {
+    op.execute(*this, extra...);
+    return *this;
+  }
 
-    template <detail::op_id id, detail::op_type ot, typename L, typename R, typename... Extra>
-    class_ & def_cast(const detail::op_<id, ot, L, R> &op, const Extra&... extra) {
-        op.execute_cast(*this, extra...);
-        return *this;
-    }
+  template <detail::op_id id, detail::op_type ot, typename L, typename R,
+            typename... Extra>
+  class_ &def_cast(const detail::op_<id, ot, L, R> &op,
+                   const Extra &... extra) {
+    op.execute_cast(*this, extra...);
+    return *this;
+  }
 
-    template <typename... Args, typename... Extra>
-    class_ &def(const detail::initimpl::constructor<Args...> &init, const Extra&... extra) {
-        init.execute(*this, extra...);
-        return *this;
-    }
+  template <typename... Args, typename... Extra>
+  class_ &def(const detail::initimpl::constructor<Args...> &init,
+              const Extra &... extra) {
+    init.execute(*this, extra...);
+    return *this;
+  }
 
-    template <typename... Args, typename... Extra>
-    class_ &def(const detail::initimpl::alias_constructor<Args...> &init, const Extra&... extra) {
-        init.execute(*this, extra...);
-        return *this;
-    }
+  template <typename... Args, typename... Extra>
+  class_ &def(const detail::initimpl::alias_constructor<Args...> &init,
+              const Extra &... extra) {
+    init.execute(*this, extra...);
+    return *this;
+  }
 
-    template <typename... Args, typename... Extra>
-    class_ &def(detail::initimpl::factory<Args...> &&init, const Extra&... extra) {
-        std::move(init).execute(*this, extra...);
-        return *this;
-    }
+  template <typename... Args, typename... Extra>
+  class_ &def(detail::initimpl::factory<Args...> &&init,
+              const Extra &... extra) {
+    std::move(init).execute(*this, extra...);
+    return *this;
+  }
 
-    template <typename... Args, typename... Extra>
-    class_ &def(detail::initimpl::pickle_factory<Args...> &&pf, const Extra &...extra) {
-        std::move(pf).execute(*this, extra...);
-        return *this;
-    }
+  template <typename... Args, typename... Extra>
+  class_ &def(detail::initimpl::pickle_factory<Args...> &&pf,
+              const Extra &... extra) {
+    std::move(pf).execute(*this, extra...);
+    return *this;
+  }
 
-    template <typename Func> class_& def_buffer(Func &&func) {
-        struct capture { Func func; };
-        capture *ptr = new capture { std::forward<Func>(func) };
-        install_buffer_funcs([](PyObject *obj, void *ptr) -> buffer_info* {
-            detail::make_caster<type> caster;
-            if (!caster.load(obj, false))
-                return nullptr;
-            return new buffer_info(((capture *) ptr)->func(caster));
-        }, ptr);
-        return *this;
-    }
+  template <typename Func> class_ &def_buffer(Func &&func) {
+    struct capture {
+      Func func;
+    };
+    capture *ptr = new capture{std::forward<Func>(func)};
+    install_buffer_funcs(
+        [](PyObject *obj, void *ptr) -> buffer_info * {
+          detail::make_caster<type> caster;
+          if (!caster.load(obj, false))
+            return nullptr;
+          return new buffer_info(((capture *)ptr)->func(caster));
+        },
+        ptr);
+    return *this;
+  }
 
-    template <typename Return, typename Class, typename... Args>
-    class_ &def_buffer(Return (Class::*func)(Args...)) {
-        return def_buffer([func] (type &obj) { return (obj.*func)(); });
-    }
+  template <typename Return, typename Class, typename... Args>
+  class_ &def_buffer(Return (Class::*func)(Args...)) {
+    return def_buffer([func](type &obj) { return (obj.*func)(); });
+  }
 
-    template <typename Return, typename Class, typename... Args>
-    class_ &def_buffer(Return (Class::*func)(Args...) const) {
-        return def_buffer([func] (const type &obj) { return (obj.*func)(); });
-    }
+  template <typename Return, typename Class, typename... Args>
+  class_ &def_buffer(Return (Class::*func)(Args...) const) {
+    return def_buffer([func](const type &obj) { return (obj.*func)(); });
+  }
 
-    template <typename C, typename D, typename... Extra>
-    class_ &def_readwrite(const char *name, D C::*pm, const Extra&... extra) {
-        static_assert(std::is_same<C, type>::value || std::is_base_of<C, type>::value, "def_readwrite() requires a class member (or base class member)");
-        cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)),
-                     fset([pm](type &c, const D &value) { c.*pm = value; }, is_method(*this));
-        def_property(name, fget, fset, return_value_policy::reference_internal, extra...);
-        return *this;
-    }
+  template <typename C, typename D, typename... Extra>
+  class_ &def_readwrite(const char *name, D C::*pm, const Extra &... extra) {
+    static_assert(
+        std::is_same<C, type>::value || std::is_base_of<C, type>::value,
+        "def_readwrite() requires a class member (or base class member)");
+    cpp_function fget([pm](const type &c) -> const D & { return c.*pm; },
+                      is_method(*this)),
+        fset([pm](type &c, const D &value) { c.*pm = value; },
+             is_method(*this));
+    def_property(name, fget, fset, return_value_policy::reference_internal,
+                 extra...);
+    return *this;
+  }
 
-    template <typename C, typename D, typename... Extra>
-    class_ &def_readonly(const char *name, const D C::*pm, const Extra& ...extra) {
-        static_assert(std::is_same<C, type>::value || std::is_base_of<C, type>::value, "def_readonly() requires a class member (or base class member)");
-        cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this));
-        def_property_readonly(name, fget, return_value_policy::reference_internal, extra...);
-        return *this;
-    }
+  template <typename C, typename D, typename... Extra>
+  class_ &def_readonly(const char *name, const D C::*pm,
+                       const Extra &... extra) {
+    static_assert(
+        std::is_same<C, type>::value || std::is_base_of<C, type>::value,
+        "def_readonly() requires a class member (or base class member)");
+    cpp_function fget([pm](const type &c) -> const D & { return c.*pm; },
+                      is_method(*this));
+    def_property_readonly(name, fget, return_value_policy::reference_internal,
+                          extra...);
+    return *this;
+  }
 
-    template <typename D, typename... Extra>
-    class_ &def_readwrite_static(const char *name, D *pm, const Extra& ...extra) {
-        cpp_function fget([pm](object) -> const D &{ return *pm; }, scope(*this)),
-                     fset([pm](object, const D &value) { *pm = value; }, scope(*this));
-        def_property_static(name, fget, fset, return_value_policy::reference, extra...);
-        return *this;
-    }
+  template <typename D, typename... Extra>
+  class_ &def_readwrite_static(const char *name, D *pm,
+                               const Extra &... extra) {
+    cpp_function fget([pm](object) -> const D & { return *pm; }, scope(*this)),
+        fset([pm](object, const D &value) { *pm = value; }, scope(*this));
+    def_property_static(name, fget, fset, return_value_policy::reference,
+                        extra...);
+    return *this;
+  }
 
-    template <typename D, typename... Extra>
-    class_ &def_readonly_static(const char *name, const D *pm, const Extra& ...extra) {
-        cpp_function fget([pm](object) -> const D &{ return *pm; }, scope(*this));
-        def_property_readonly_static(name, fget, return_value_policy::reference, extra...);
-        return *this;
-    }
+  template <typename D, typename... Extra>
+  class_ &def_readonly_static(const char *name, const D *pm,
+                              const Extra &... extra) {
+    cpp_function fget([pm](object) -> const D & { return *pm; }, scope(*this));
+    def_property_readonly_static(name, fget, return_value_policy::reference,
+                                 extra...);
+    return *this;
+  }
 
-    /// Uses return_value_policy::reference_internal by default
-    template <typename Getter, typename... Extra>
-    class_ &def_property_readonly(const char *name, const Getter &fget, const Extra& ...extra) {
-        return def_property_readonly(name, cpp_function(method_adaptor<type>(fget)),
-                                     return_value_policy::reference_internal, extra...);
-    }
+  /// Uses return_value_policy::reference_internal by default
+  template <typename Getter, typename... Extra>
+  class_ &def_property_readonly(const char *name, const Getter &fget,
+                                const Extra &... extra) {
+    return def_property_readonly(name, cpp_function(method_adaptor<type>(fget)),
+                                 return_value_policy::reference_internal,
+                                 extra...);
+  }
 
-    /// Uses cpp_function's return_value_policy by default
-    template <typename... Extra>
-    class_ &def_property_readonly(const char *name, const cpp_function &fget, const Extra& ...extra) {
-        return def_property(name, fget, nullptr, extra...);
-    }
+  /// Uses cpp_function's return_value_policy by default
+  template <typename... Extra>
+  class_ &def_property_readonly(const char *name, const cpp_function &fget,
+                                const Extra &... extra) {
+    return def_property(name, fget, nullptr, extra...);
+  }
 
-    /// Uses return_value_policy::reference by default
-    template <typename Getter, typename... Extra>
-    class_ &def_property_readonly_static(const char *name, const Getter &fget, const Extra& ...extra) {
-        return def_property_readonly_static(name, cpp_function(fget), return_value_policy::reference, extra...);
-    }
+  /// Uses return_value_policy::reference by default
+  template <typename Getter, typename... Extra>
+  class_ &def_property_readonly_static(const char *name, const Getter &fget,
+                                       const Extra &... extra) {
+    return def_property_readonly_static(
+        name, cpp_function(fget), return_value_policy::reference, extra...);
+  }
 
-    /// Uses cpp_function's return_value_policy by default
-    template <typename... Extra>
-    class_ &def_property_readonly_static(const char *name, const cpp_function &fget, const Extra& ...extra) {
-        return def_property_static(name, fget, nullptr, extra...);
-    }
+  /// Uses cpp_function's return_value_policy by default
+  template <typename... Extra>
+  class_ &def_property_readonly_static(const char *name,
+                                       const cpp_function &fget,
+                                       const Extra &... extra) {
+    return def_property_static(name, fget, nullptr, extra...);
+  }
 
-    /// Uses return_value_policy::reference_internal by default
-    template <typename Getter, typename Setter, typename... Extra>
-    class_ &def_property(const char *name, const Getter &fget, const Setter &fset, const Extra& ...extra) {
-        return def_property(name, fget, cpp_function(method_adaptor<type>(fset)), extra...);
-    }
-    template <typename Getter, typename... Extra>
-    class_ &def_property(const char *name, const Getter &fget, const cpp_function &fset, const Extra& ...extra) {
-        return def_property(name, cpp_function(method_adaptor<type>(fget)), fset,
-                            return_value_policy::reference_internal, extra...);
-    }
+  /// Uses return_value_policy::reference_internal by default
+  template <typename Getter, typename Setter, typename... Extra>
+  class_ &def_property(const char *name, const Getter &fget, const Setter &fset,
+                       const Extra &... extra) {
+    return def_property(name, fget, cpp_function(method_adaptor<type>(fset)),
+                        extra...);
+  }
+  template <typename Getter, typename... Extra>
+  class_ &def_property(const char *name, const Getter &fget,
+                       const cpp_function &fset, const Extra &... extra) {
+    return def_property(name, cpp_function(method_adaptor<type>(fget)), fset,
+                        return_value_policy::reference_internal, extra...);
+  }
 
-    /// Uses cpp_function's return_value_policy by default
-    template <typename... Extra>
-    class_ &def_property(const char *name, const cpp_function &fget, const cpp_function &fset, const Extra& ...extra) {
-        return def_property_static(name, fget, fset, is_method(*this), extra...);
-    }
+  /// Uses cpp_function's return_value_policy by default
+  template <typename... Extra>
+  class_ &def_property(const char *name, const cpp_function &fget,
+                       const cpp_function &fset, const Extra &... extra) {
+    return def_property_static(name, fget, fset, is_method(*this), extra...);
+  }
 
-    /// Uses return_value_policy::reference by default
-    template <typename Getter, typename... Extra>
-    class_ &def_property_static(const char *name, const Getter &fget, const cpp_function &fset, const Extra& ...extra) {
-        return def_property_static(name, cpp_function(fget), fset, return_value_policy::reference, extra...);
-    }
+  /// Uses return_value_policy::reference by default
+  template <typename Getter, typename... Extra>
+  class_ &def_property_static(const char *name, const Getter &fget,
+                              const cpp_function &fset,
+                              const Extra &... extra) {
+    return def_property_static(name, cpp_function(fget), fset,
+                               return_value_policy::reference, extra...);
+  }
 
-    /// Uses cpp_function's return_value_policy by default
-    template <typename... Extra>
-    class_ &def_property_static(const char *name, const cpp_function &fget, const cpp_function &fset, const Extra& ...extra) {
-        static_assert( 0 == detail::constexpr_sum(std::is_base_of<arg, Extra>::value...),
-                      "Argument annotations are not allowed for properties");
-        auto rec_fget = get_function_record(fget), rec_fset = get_function_record(fset);
-        auto *rec_active = rec_fget;
-        if (rec_fget) {
-           char *doc_prev = rec_fget->doc; /* 'extra' field may include a property-specific documentation string */
-           detail::process_attributes<Extra...>::init(extra..., rec_fget);
-           if (rec_fget->doc && rec_fget->doc != doc_prev) {
-              free(doc_prev);
-              rec_fget->doc = strdup(rec_fget->doc);
-           }
-        }
-        if (rec_fset) {
-            char *doc_prev = rec_fset->doc;
-            detail::process_attributes<Extra...>::init(extra..., rec_fset);
-            if (rec_fset->doc && rec_fset->doc != doc_prev) {
-                free(doc_prev);
-                rec_fset->doc = strdup(rec_fset->doc);
-            }
-            if (! rec_active) rec_active = rec_fset;
-        }
-        def_property_static_impl(name, fget, fset, rec_active);
-        return *this;
+  /// Uses cpp_function's return_value_policy by default
+  template <typename... Extra>
+  class_ &def_property_static(const char *name, const cpp_function &fget,
+                              const cpp_function &fset,
+                              const Extra &... extra) {
+    static_assert(
+        0 == detail::constexpr_sum(std::is_base_of<arg, Extra>::value...),
+        "Argument annotations are not allowed for properties");
+    auto rec_fget = get_function_record(fget),
+         rec_fset = get_function_record(fset);
+    auto *rec_active = rec_fget;
+    if (rec_fget) {
+      char *doc_prev =
+          rec_fget->doc; /* 'extra' field may include a property-specific
+                            documentation string */
+      detail::process_attributes<Extra...>::init(extra..., rec_fget);
+      if (rec_fget->doc && rec_fget->doc != doc_prev) {
+        free(doc_prev);
+        rec_fget->doc = strdup(rec_fget->doc);
+      }
     }
+    if (rec_fset) {
+      char *doc_prev = rec_fset->doc;
+      detail::process_attributes<Extra...>::init(extra..., rec_fset);
+      if (rec_fset->doc && rec_fset->doc != doc_prev) {
+        free(doc_prev);
+        rec_fset->doc = strdup(rec_fset->doc);
+      }
+      if (!rec_active)
+        rec_active = rec_fset;
+    }
+    def_property_static_impl(name, fget, fset, rec_active);
+    return *this;
+  }
 
 private:
-    /// Initialize holder object, variant 1: object derives from enable_shared_from_this
-    template <typename T>
-    static void init_holder(detail::instance *inst, detail::value_and_holder &v_h,
-            const holder_type * /* unused */, const std::enable_shared_from_this<T> * /* dummy */) {
-        try {
-            auto sh = std::dynamic_pointer_cast<typename holder_type::element_type>(
-                    v_h.value_ptr<type>()->shared_from_this());
-            if (sh) {
-                new (std::addressof(v_h.holder<holder_type>())) holder_type(std::move(sh));
-                v_h.set_holder_constructed();
-            }
-        } catch (const std::bad_weak_ptr &) {}
-
-        if (!v_h.holder_constructed() && inst->owned) {
-            new (std::addressof(v_h.holder<holder_type>())) holder_type(v_h.value_ptr<type>());
-            v_h.set_holder_constructed();
-        }
+  /// Initialize holder object, variant 1: object derives from
+  /// enable_shared_from_this
+  template <typename T>
+  static void init_holder(detail::instance *inst, detail::value_and_holder &v_h,
+                          const holder_type * /* unused */,
+                          const std::enable_shared_from_this<T> * /* dummy */) {
+    try {
+      auto sh = std::dynamic_pointer_cast<typename holder_type::element_type>(
+          v_h.value_ptr<type>()->shared_from_this());
+      if (sh) {
+        new (std::addressof(v_h.holder<holder_type>()))
+            holder_type(std::move(sh));
+        v_h.set_holder_constructed();
+      }
+    } catch (const std::bad_weak_ptr &) {
     }
 
-    static void init_holder_from_existing(const detail::value_and_holder &v_h,
-            const holder_type *holder_ptr, std::true_type /*is_copy_constructible*/) {
-        new (std::addressof(v_h.holder<holder_type>())) holder_type(*reinterpret_cast<const holder_type *>(holder_ptr));
+    if (!v_h.holder_constructed() && inst->owned) {
+      new (std::addressof(v_h.holder<holder_type>()))
+          holder_type(v_h.value_ptr<type>());
+      v_h.set_holder_constructed();
     }
+  }
 
-    static void init_holder_from_existing(const detail::value_and_holder &v_h,
-            const holder_type *holder_ptr, std::false_type /*is_copy_constructible*/) {
-        new (std::addressof(v_h.holder<holder_type>())) holder_type(std::move(*const_cast<holder_type *>(holder_ptr)));
-    }
+  static void
+  init_holder_from_existing(const detail::value_and_holder &v_h,
+                            const holder_type *holder_ptr,
+                            std::true_type /*is_copy_constructible*/) {
+    new (std::addressof(v_h.holder<holder_type>()))
+        holder_type(*reinterpret_cast<const holder_type *>(holder_ptr));
+  }
 
-    /// Initialize holder object, variant 2: try to construct from existing holder object, if possible
-    static void init_holder(detail::instance *inst, detail::value_and_holder &v_h,
-            const holder_type *holder_ptr, const void * /* dummy -- not enable_shared_from_this<T>) */) {
-        if (holder_ptr) {
-            init_holder_from_existing(v_h, holder_ptr, std::is_copy_constructible<holder_type>());
-            v_h.set_holder_constructed();
-        } else if (inst->owned || detail::always_construct_holder<holder_type>::value) {
-            new (std::addressof(v_h.holder<holder_type>())) holder_type(v_h.value_ptr<type>());
-            v_h.set_holder_constructed();
-        }
-    }
+  static void
+  init_holder_from_existing(const detail::value_and_holder &v_h,
+                            const holder_type *holder_ptr,
+                            std::false_type /*is_copy_constructible*/) {
+    new (std::addressof(v_h.holder<holder_type>()))
+        holder_type(std::move(*const_cast<holder_type *>(holder_ptr)));
+  }
 
-    /// Performs instance initialization including constructing a holder and registering the known
-    /// instance.  Should be called as soon as the `type` value_ptr is set for an instance.  Takes an
-    /// optional pointer to an existing holder to use; if not specified and the instance is
-    /// `.owned`, a new holder will be constructed to manage the value pointer.
-    static void init_instance(detail::instance *inst, const void *holder_ptr) {
-        auto v_h = inst->get_value_and_holder(detail::get_type_info(typeid(type)));
-        if (!v_h.instance_registered()) {
-            register_instance(inst, v_h.value_ptr(), v_h.type);
-            v_h.set_instance_registered();
-        }
-        init_holder(inst, v_h, (const holder_type *) holder_ptr, v_h.value_ptr<type>());
+  /// Initialize holder object, variant 2: try to construct from existing holder
+  /// object, if possible
+  static void
+  init_holder(detail::instance *inst, detail::value_and_holder &v_h,
+              const holder_type *holder_ptr,
+              const void * /* dummy -- not enable_shared_from_this<T>) */) {
+    if (holder_ptr) {
+      init_holder_from_existing(v_h, holder_ptr,
+                                std::is_copy_constructible<holder_type>());
+      v_h.set_holder_constructed();
+    } else if (inst->owned ||
+               detail::always_construct_holder<holder_type>::value) {
+      new (std::addressof(v_h.holder<holder_type>()))
+          holder_type(v_h.value_ptr<type>());
+      v_h.set_holder_constructed();
     }
+  }
 
-    /// Deallocates an instance; via holder, if constructed; otherwise via operator delete.
-    static void dealloc(detail::value_and_holder &v_h) {
-        if (v_h.holder_constructed()) {
-            v_h.holder<holder_type>().~holder_type();
-            v_h.set_holder_constructed(false);
-        }
-        else {
-            detail::call_operator_delete(v_h.value_ptr<type>(),
-                v_h.type->type_size,
-                v_h.type->type_align
-            );
-        }
-        v_h.value_ptr() = nullptr;
+  /// Performs instance initialization including constructing a holder and
+  /// registering the known instance.  Should be called as soon as the `type`
+  /// value_ptr is set for an instance.  Takes an optional pointer to an
+  /// existing holder to use; if not specified and the instance is
+  /// `.owned`, a new holder will be constructed to manage the value pointer.
+  static void init_instance(detail::instance *inst, const void *holder_ptr) {
+    auto v_h = inst->get_value_and_holder(detail::get_type_info(typeid(type)));
+    if (!v_h.instance_registered()) {
+      register_instance(inst, v_h.value_ptr(), v_h.type);
+      v_h.set_instance_registered();
     }
+    init_holder(inst, v_h, (const holder_type *)holder_ptr,
+                v_h.value_ptr<type>());
+  }
 
-    static detail::function_record *get_function_record(handle h) {
-        h = detail::get_function(h);
-        return h ? (detail::function_record *) reinterpret_borrow<capsule>(PyCFunction_GET_SELF(h.ptr()))
-                 : nullptr;
+  /// Deallocates an instance; via holder, if constructed; otherwise via
+  /// operator delete.
+  static void dealloc(detail::value_and_holder &v_h) {
+    if (v_h.holder_constructed()) {
+      v_h.holder<holder_type>().~holder_type();
+      v_h.set_holder_constructed(false);
+    } else {
+      detail::call_operator_delete(v_h.value_ptr<type>(), v_h.type->type_size,
+                                   v_h.type->type_align);
     }
+    v_h.value_ptr() = nullptr;
+  }
+
+  static detail::function_record *get_function_record(handle h) {
+    h = detail::get_function(h);
+    return h ? (detail::function_record *)reinterpret_borrow<capsule>(
+                   PyCFunction_GET_SELF(h.ptr()))
+             : nullptr;
+  }
 };
 
 /// Binds an existing constructor taking arguments Args...
-template <typename... Args> detail::initimpl::constructor<Args...> init() { return {}; }
-/// Like `init<Args...>()`, but the instance is always constructed through the alias class (even
-/// when not inheriting on the Python side).
-template <typename... Args> detail::initimpl::alias_constructor<Args...> init_alias() { return {}; }
+template <typename... Args> detail::initimpl::constructor<Args...> init() {
+  return {};
+}
+/// Like `init<Args...>()`, but the instance is always constructed through the
+/// alias class (even when not inheriting on the Python side).
+template <typename... Args>
+detail::initimpl::alias_constructor<Args...> init_alias() {
+  return {};
+}
 
 /// Binds a factory function as a constructor
 template <typename Func, typename Ret = detail::initimpl::factory<Func>>
-Ret init(Func &&f) { return {std::forward<Func>(f)}; }
-
-/// Dual-argument factory function: the first function is called when no alias is needed, the second
-/// when an alias is needed (i.e. due to python-side inheritance).  Arguments must be identical.
-template <typename CFunc, typename AFunc, typename Ret = detail::initimpl::factory<CFunc, AFunc>>
-Ret init(CFunc &&c, AFunc &&a) {
-    return {std::forward<CFunc>(c), std::forward<AFunc>(a)};
+Ret init(Func &&f) {
+  return {std::forward<Func>(f)};
 }
 
-/// Binds pickling functions `__getstate__` and `__setstate__` and ensures that the type
-/// returned by `__getstate__` is the same as the argument accepted by `__setstate__`.
+/// Dual-argument factory function: the first function is called when no alias
+/// is needed, the second when an alias is needed (i.e. due to python-side
+/// inheritance).  Arguments must be identical.
+template <typename CFunc, typename AFunc,
+          typename Ret = detail::initimpl::factory<CFunc, AFunc>>
+Ret init(CFunc &&c, AFunc &&a) {
+  return {std::forward<CFunc>(c), std::forward<AFunc>(a)};
+}
+
+/// Binds pickling functions `__getstate__` and `__setstate__` and ensures that
+/// the type returned by `__getstate__` is the same as the argument accepted by
+/// `__setstate__`.
 template <typename GetState, typename SetState>
-detail::initimpl::pickle_factory<GetState, SetState> pickle(GetState &&g, SetState &&s) {
-    return {std::forward<GetState>(g), std::forward<SetState>(s)};
+detail::initimpl::pickle_factory<GetState, SetState> pickle(GetState &&g,
+                                                            SetState &&s) {
+  return {std::forward<GetState>(g), std::forward<SetState>(s)};
 }
 
 NAMESPACE_BEGIN(detail)
 struct enum_base {
-    enum_base(handle base, handle parent) : m_base(base), m_parent(parent) { }
+  enum_base(handle base, handle parent) : m_base(base), m_parent(parent) {}
 
-    PYBIND11_NOINLINE void init(bool is_arithmetic, bool is_convertible) {
-        m_base.attr("__entries") = dict();
-        auto property = handle((PyObject *) &PyProperty_Type);
-        auto static_property = handle((PyObject *) get_internals().static_property_type);
+  PYBIND11_NOINLINE void init(bool is_arithmetic, bool is_convertible) {
+    m_base.attr("__entries") = dict();
+    auto property = handle((PyObject *)&PyProperty_Type);
+    auto static_property =
+        handle((PyObject *)get_internals().static_property_type);
 
-        m_base.attr("__repr__") = cpp_function(
-            [](handle arg) -> str {
-                handle type = arg.get_type();
-                object type_name = type.attr("__name__");
-                dict entries = type.attr("__entries");
-                for (const auto &kv : entries) {
-                    object other = kv.second[int_(0)];
-                    if (other.equal(arg))
-                        return pybind11::str("{}.{}").format(type_name, kv.first);
-                }
-                return pybind11::str("{}.???").format(type_name);
-            }, is_method(m_base)
-        );
+    m_base.attr("__repr__") = cpp_function(
+        [](handle arg) -> str {
+          handle type = arg.get_type();
+          object type_name = type.attr("__name__");
+          dict entries = type.attr("__entries");
+          for (const auto &kv : entries) {
+            object other = kv.second[int_(0)];
+            if (other.equal(arg))
+              return pybind11::str("{}.{}").format(type_name, kv.first);
+          }
+          return pybind11::str("{}.???").format(type_name);
+        },
+        is_method(m_base));
 
-        m_base.attr("name") = property(cpp_function(
-            [](handle arg) -> str {
-                dict entries = arg.get_type().attr("__entries");
-                for (const auto &kv : entries) {
-                    if (handle(kv.second[int_(0)]).equal(arg))
-                        return pybind11::str(kv.first);
-                }
-                return "???";
-            }, is_method(m_base)
-        ));
+    m_base.attr("name") = property(cpp_function(
+        [](handle arg) -> str {
+          dict entries = arg.get_type().attr("__entries");
+          for (const auto &kv : entries) {
+            if (handle(kv.second[int_(0)]).equal(arg))
+              return pybind11::str(kv.first);
+          }
+          return "???";
+        },
+        is_method(m_base)));
 
-        m_base.attr("__doc__") = static_property(cpp_function(
-            [](handle arg) -> std::string {
-                std::string docstring;
-                dict entries = arg.attr("__entries");
-                if (((PyTypeObject *) arg.ptr())->tp_doc)
-                    docstring += std::string(((PyTypeObject *) arg.ptr())->tp_doc) + "\n\n";
-                docstring += "Members:";
-                for (const auto &kv : entries) {
-                    auto key = std::string(pybind11::str(kv.first));
-                    auto comment = kv.second[int_(1)];
-                    docstring += "\n\n  " + key;
-                    if (!comment.is_none())
-                        docstring += " : " + (std::string) pybind11::str(comment);
-                }
-                return docstring;
-            }
-        ), none(), none(), "");
+    m_base.attr("__doc__") = static_property(
+        cpp_function([](handle arg) -> std::string {
+          std::string docstring;
+          dict entries = arg.attr("__entries");
+          if (((PyTypeObject *)arg.ptr())->tp_doc)
+            docstring +=
+                std::string(((PyTypeObject *)arg.ptr())->tp_doc) + "\n\n";
+          docstring += "Members:";
+          for (const auto &kv : entries) {
+            auto key = std::string(pybind11::str(kv.first));
+            auto comment = kv.second[int_(1)];
+            docstring += "\n\n  " + key;
+            if (!comment.is_none())
+              docstring += " : " + (std::string)pybind11::str(comment);
+          }
+          return docstring;
+        }),
+        none(), none(), "");
 
-        m_base.attr("__members__") = static_property(cpp_function(
-            [](handle arg) -> dict {
-                dict entries = arg.attr("__entries"), m;
-                for (const auto &kv : entries)
-                    m[kv.first] = kv.second[int_(0)];
-                return m;
-            }), none(), none(), ""
-        );
+    m_base.attr("__members__") =
+        static_property(cpp_function([](handle arg) -> dict {
+                          dict entries = arg.attr("__entries"), m;
+                          for (const auto &kv : entries)
+                            m[kv.first] = kv.second[int_(0)];
+                          return m;
+                        }),
+                        none(), none(), "");
 
-        #define PYBIND11_ENUM_OP_STRICT(op, expr, strict_behavior)                     \
-            m_base.attr(op) = cpp_function(                                            \
-                [](object a, object b) {                                               \
-                    if (!a.get_type().is(b.get_type()))                                \
-                        strict_behavior;                                               \
-                    return expr;                                                       \
-                },                                                                     \
-                is_method(m_base))
+#define PYBIND11_ENUM_OP_STRICT(op, expr, strict_behavior)                     \
+  m_base.attr(op) = cpp_function(                                              \
+      [](object a, object b) {                                                 \
+        if (!a.get_type().is(b.get_type()))                                    \
+          strict_behavior;                                                     \
+        return expr;                                                           \
+      },                                                                       \
+      is_method(m_base))
 
-        #define PYBIND11_ENUM_OP_CONV(op, expr)                                        \
-            m_base.attr(op) = cpp_function(                                            \
-                [](object a_, object b_) {                                             \
-                    int_ a(a_), b(b_);                                                 \
-                    return expr;                                                       \
-                },                                                                     \
-                is_method(m_base))
+#define PYBIND11_ENUM_OP_CONV(op, expr)                                        \
+  m_base.attr(op) = cpp_function(                                              \
+      [](object a_, object b_) {                                               \
+        int_ a(a_), b(b_);                                                     \
+        return expr;                                                           \
+      },                                                                       \
+      is_method(m_base))
 
-        if (is_convertible) {
-            PYBIND11_ENUM_OP_CONV("__eq__", !b.is_none() &&  a.equal(b));
-            PYBIND11_ENUM_OP_CONV("__ne__",  b.is_none() || !a.equal(b));
+    if (is_convertible) {
+      PYBIND11_ENUM_OP_CONV("__eq__", !b.is_none() && a.equal(b));
+      PYBIND11_ENUM_OP_CONV("__ne__", b.is_none() || !a.equal(b));
 
-            if (is_arithmetic) {
-                PYBIND11_ENUM_OP_CONV("__lt__",   a <  b);
-                PYBIND11_ENUM_OP_CONV("__gt__",   a >  b);
-                PYBIND11_ENUM_OP_CONV("__le__",   a <= b);
-                PYBIND11_ENUM_OP_CONV("__ge__",   a >= b);
-                PYBIND11_ENUM_OP_CONV("__and__",  a &  b);
-                PYBIND11_ENUM_OP_CONV("__rand__", a &  b);
-                PYBIND11_ENUM_OP_CONV("__or__",   a |  b);
-                PYBIND11_ENUM_OP_CONV("__ror__",  a |  b);
-                PYBIND11_ENUM_OP_CONV("__xor__",  a ^  b);
-                PYBIND11_ENUM_OP_CONV("__rxor__", a ^  b);
-            }
-        } else {
-            PYBIND11_ENUM_OP_STRICT("__eq__",  int_(a).equal(int_(b)), return false);
-            PYBIND11_ENUM_OP_STRICT("__ne__", !int_(a).equal(int_(b)), return true);
+      if (is_arithmetic) {
+        PYBIND11_ENUM_OP_CONV("__lt__", a < b);
+        PYBIND11_ENUM_OP_CONV("__gt__", a > b);
+        PYBIND11_ENUM_OP_CONV("__le__", a <= b);
+        PYBIND11_ENUM_OP_CONV("__ge__", a >= b);
+        PYBIND11_ENUM_OP_CONV("__and__", a & b);
+        PYBIND11_ENUM_OP_CONV("__rand__", a & b);
+        PYBIND11_ENUM_OP_CONV("__or__", a | b);
+        PYBIND11_ENUM_OP_CONV("__ror__", a | b);
+        PYBIND11_ENUM_OP_CONV("__xor__", a ^ b);
+        PYBIND11_ENUM_OP_CONV("__rxor__", a ^ b);
+      }
+    } else {
+      PYBIND11_ENUM_OP_STRICT("__eq__", int_(a).equal(int_(b)), return false);
+      PYBIND11_ENUM_OP_STRICT("__ne__", !int_(a).equal(int_(b)), return true);
 
-            if (is_arithmetic) {
-                #define PYBIND11_THROW throw type_error("Expected an enumeration of matching type!");
-                PYBIND11_ENUM_OP_STRICT("__lt__", int_(a) <  int_(b), PYBIND11_THROW);
-                PYBIND11_ENUM_OP_STRICT("__gt__", int_(a) >  int_(b), PYBIND11_THROW);
-                PYBIND11_ENUM_OP_STRICT("__le__", int_(a) <= int_(b), PYBIND11_THROW);
-                PYBIND11_ENUM_OP_STRICT("__ge__", int_(a) >= int_(b), PYBIND11_THROW);
-                #undef PYBIND11_THROW
-            }
-        }
-
-        #undef PYBIND11_ENUM_OP_CONV
-        #undef PYBIND11_ENUM_OP_STRICT
-
-        object getstate = cpp_function(
-            [](object arg) { return int_(arg); }, is_method(m_base));
-
-        m_base.attr("__getstate__") = getstate;
-        m_base.attr("__hash__") = getstate;
+      if (is_arithmetic) {
+#define PYBIND11_THROW                                                         \
+  throw type_error("Expected an enumeration of matching type!");
+        PYBIND11_ENUM_OP_STRICT("__lt__", int_(a) < int_(b), PYBIND11_THROW);
+        PYBIND11_ENUM_OP_STRICT("__gt__", int_(a) > int_(b), PYBIND11_THROW);
+        PYBIND11_ENUM_OP_STRICT("__le__", int_(a) <= int_(b), PYBIND11_THROW);
+        PYBIND11_ENUM_OP_STRICT("__ge__", int_(a) >= int_(b), PYBIND11_THROW);
+#undef PYBIND11_THROW
+      }
     }
 
-    PYBIND11_NOINLINE void value(char const* name_, object value, const char *doc = nullptr) {
-        dict entries = m_base.attr("__entries");
-        str name(name_);
-        if (entries.contains(name)) {
-            std::string type_name = (std::string) str(m_base.attr("__name__"));
-            throw value_error(type_name + ": element \"" + std::string(name_) + "\" already exists!");
-        }
+#undef PYBIND11_ENUM_OP_CONV
+#undef PYBIND11_ENUM_OP_STRICT
 
-        entries[name] = std::make_pair(value, doc);
-        m_base.attr(name) = value;
+    object getstate =
+        cpp_function([](object arg) { return int_(arg); }, is_method(m_base));
+
+    m_base.attr("__getstate__") = getstate;
+    m_base.attr("__hash__") = getstate;
+  }
+
+  PYBIND11_NOINLINE void value(char const *name_, object value,
+                               const char *doc = nullptr) {
+    dict entries = m_base.attr("__entries");
+    str name(name_);
+    if (entries.contains(name)) {
+      std::string type_name = (std::string)str(m_base.attr("__name__"));
+      throw value_error(type_name + ": element \"" + std::string(name_) +
+                        "\" already exists!");
     }
 
-    PYBIND11_NOINLINE void export_values() {
-        dict entries = m_base.attr("__entries");
-        for (const auto &kv : entries)
-            m_parent.attr(kv.first) = kv.second[int_(0)];
-    }
+    entries[name] = std::make_pair(value, doc);
+    m_base.attr(name) = value;
+  }
 
-    handle m_base;
-    handle m_parent;
+  PYBIND11_NOINLINE void export_values() {
+    dict entries = m_base.attr("__entries");
+    for (const auto &kv : entries)
+      m_parent.attr(kv.first) = kv.second[int_(0)];
+  }
+
+  handle m_base;
+  handle m_parent;
 };
 
 NAMESPACE_END(detail)
@@ -1537,221 +1774,236 @@ NAMESPACE_END(detail)
 /// Binds C++ enumerations and enumeration classes to Python
 template <typename Type> class enum_ : public class_<Type> {
 public:
-    using Base = class_<Type>;
-    using Base::def;
-    using Base::attr;
-    using Base::def_property_readonly;
-    using Base::def_property_readonly_static;
-    using Scalar = typename std::underlying_type<Type>::type;
+  using Base = class_<Type>;
+  using Base::attr;
+  using Base::def;
+  using Base::def_property_readonly;
+  using Base::def_property_readonly_static;
+  using Scalar = typename std::underlying_type<Type>::type;
 
-    template <typename... Extra>
-    enum_(const handle &scope, const char *name, const Extra&... extra)
+  template <typename... Extra>
+  enum_(const handle &scope, const char *name, const Extra &... extra)
       : class_<Type>(scope, name, extra...), m_base(*this, scope) {
-        constexpr bool is_arithmetic = detail::any_of<std::is_same<arithmetic, Extra>...>::value;
-        constexpr bool is_convertible = std::is_convertible<Type, Scalar>::value;
-        m_base.init(is_arithmetic, is_convertible);
+    constexpr bool is_arithmetic =
+        detail::any_of<std::is_same<arithmetic, Extra>...>::value;
+    constexpr bool is_convertible = std::is_convertible<Type, Scalar>::value;
+    m_base.init(is_arithmetic, is_convertible);
 
-        def(init([](Scalar i) { return static_cast<Type>(i); }));
-        def("__int__", [](Type value) { return (Scalar) value; });
-        #if PY_MAJOR_VERSION < 3
-            def("__long__", [](Type value) { return (Scalar) value; });
-        #endif
-        cpp_function setstate(
-            [](Type &value, Scalar arg) { value = static_cast<Type>(arg); },
-            is_method(*this));
-        attr("__setstate__") = setstate;
-    }
+    def(init([](Scalar i) { return static_cast<Type>(i); }));
+    def("__int__", [](Type value) { return (Scalar)value; });
+#if PY_MAJOR_VERSION < 3
+    def("__long__", [](Type value) { return (Scalar)value; });
+#endif
+    cpp_function setstate(
+        [](Type &value, Scalar arg) { value = static_cast<Type>(arg); },
+        is_method(*this));
+    attr("__setstate__") = setstate;
+  }
 
-    /// Export enumeration entries into the parent scope
-    enum_& export_values() {
-        m_base.export_values();
-        return *this;
-    }
+  /// Export enumeration entries into the parent scope
+  enum_ &export_values() {
+    m_base.export_values();
+    return *this;
+  }
 
-    /// Add an enumeration entry
-    enum_& value(char const* name, Type value, const char *doc = nullptr) {
-        m_base.value(name, pybind11::cast(value, return_value_policy::copy), doc);
-        return *this;
-    }
+  /// Add an enumeration entry
+  enum_ &value(char const *name, Type value, const char *doc = nullptr) {
+    m_base.value(name, pybind11::cast(value, return_value_policy::copy), doc);
+    return *this;
+  }
 
 private:
-    detail::enum_base m_base;
+  detail::enum_base m_base;
 };
 
 NAMESPACE_BEGIN(detail)
 
-
 inline void keep_alive_impl(handle nurse, handle patient) {
-    if (!nurse || !patient)
-        pybind11_fail("Could not activate keep_alive!");
+  if (!nurse || !patient)
+    pybind11_fail("Could not activate keep_alive!");
 
-    if (patient.is_none() || nurse.is_none())
-        return; /* Nothing to keep alive or nothing to be kept alive by */
+  if (patient.is_none() || nurse.is_none())
+    return; /* Nothing to keep alive or nothing to be kept alive by */
 
-    auto tinfo = all_type_info(Py_TYPE(nurse.ptr()));
-    if (!tinfo.empty()) {
-        /* It's a pybind-registered type, so we can store the patient in the
-         * internal list. */
-        add_patient(nurse.ptr(), patient.ptr());
-    }
-    else {
-        /* Fall back to clever approach based on weak references taken from
-         * Boost.Python. This is not used for pybind-registered types because
-         * the objects can be destroyed out-of-order in a GC pass. */
-        cpp_function disable_lifesupport(
-            [patient](handle weakref) { patient.dec_ref(); weakref.dec_ref(); });
+  auto tinfo = all_type_info(Py_TYPE(nurse.ptr()));
+  if (!tinfo.empty()) {
+    /* It's a pybind-registered type, so we can store the patient in the
+     * internal list. */
+    add_patient(nurse.ptr(), patient.ptr());
+  } else {
+    /* Fall back to clever approach based on weak references taken from
+     * Boost.Python. This is not used for pybind-registered types because
+     * the objects can be destroyed out-of-order in a GC pass. */
+    cpp_function disable_lifesupport([patient](handle weakref) {
+      patient.dec_ref();
+      weakref.dec_ref();
+    });
 
-        weakref wr(nurse, disable_lifesupport);
+    weakref wr(nurse, disable_lifesupport);
 
-        patient.inc_ref(); /* reference patient and leak the weak reference */
-        (void) wr.release();
-    }
+    patient.inc_ref(); /* reference patient and leak the weak reference */
+    (void)wr.release();
+  }
 }
 
-PYBIND11_NOINLINE inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret) {
-    auto get_arg = [&](size_t n) {
-        if (n == 0)
-            return ret;
-        else if (n == 1 && call.init_self)
-            return call.init_self;
-        else if (n <= call.args.size())
-            return call.args[n - 1];
-        return handle();
-    };
+PYBIND11_NOINLINE inline void keep_alive_impl(size_t Nurse, size_t Patient,
+                                              function_call &call, handle ret) {
+  auto get_arg = [&](size_t n) {
+    if (n == 0)
+      return ret;
+    else if (n == 1 && call.init_self)
+      return call.init_self;
+    else if (n <= call.args.size())
+      return call.args[n - 1];
+    return handle();
+  };
 
-    keep_alive_impl(get_arg(Nurse), get_arg(Patient));
+  keep_alive_impl(get_arg(Nurse), get_arg(Patient));
 }
 
-inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_type_info_get_cache(PyTypeObject *type) {
-    auto res = get_internals().registered_types_py
+inline std::pair<decltype(internals::registered_types_py)::iterator, bool>
+all_type_info_get_cache(PyTypeObject *type) {
+  auto res = get_internals()
+                 .registered_types_py
 #ifdef __cpp_lib_unordered_map_try_emplace
-        .try_emplace(type);
+                 .try_emplace(type);
 #else
-        .emplace(type, std::vector<detail::type_info *>());
+                 .emplace(type, std::vector<detail::type_info *>());
 #endif
-    if (res.second) {
-        // New cache entry created; set up a weak reference to automatically remove it if the type
-        // gets destroyed:
-        weakref((PyObject *) type, cpp_function([type](handle wr) {
-            get_internals().registered_types_py.erase(type);
-            wr.dec_ref();
-        })).release();
-    }
+  if (res.second) {
+    // New cache entry created; set up a weak reference to automatically remove
+    // it if the type gets destroyed:
+    weakref((PyObject *)type, cpp_function([type](handle wr) {
+              get_internals().registered_types_py.erase(type);
+              wr.dec_ref();
+            }))
+        .release();
+  }
 
-    return res;
+  return res;
 }
 
-template <typename Iterator, typename Sentinel, bool KeyIterator, return_value_policy Policy>
+template <typename Iterator, typename Sentinel, bool KeyIterator,
+          return_value_policy Policy>
 struct iterator_state {
-    Iterator it;
-    Sentinel end;
-    bool first_or_done;
+  Iterator it;
+  Sentinel end;
+  bool first_or_done;
 };
 
 NAMESPACE_END(detail)
 
 /// Makes a python iterator from a first and past-the-end C++ InputIterator.
 template <return_value_policy Policy = return_value_policy::reference_internal,
-          typename Iterator,
-          typename Sentinel,
+          typename Iterator, typename Sentinel,
           typename ValueType = decltype(*std::declval<Iterator>()),
           typename... Extra>
 iterator make_iterator(Iterator first, Sentinel last, Extra &&... extra) {
-    typedef detail::iterator_state<Iterator, Sentinel, false, Policy> state;
+  typedef detail::iterator_state<Iterator, Sentinel, false, Policy> state;
 
-    if (!detail::get_type_info(typeid(state), false)) {
-        class_<state>(handle(), "iterator", pybind11::module_local())
-            .def("__iter__", [](state &s) -> state& { return s; })
-            .def("__next__", [](state &s) -> ValueType {
-                if (!s.first_or_done)
-                    ++s.it;
-                else
-                    s.first_or_done = false;
-                if (s.it == s.end) {
-                    s.first_or_done = true;
-                    throw stop_iteration();
-                }
-                return *s.it;
-            }, std::forward<Extra>(extra)..., Policy);
-    }
+  if (!detail::get_type_info(typeid(state), false)) {
+    class_<state>(handle(), "iterator", pybind11::module_local())
+        .def("__iter__", [](state &s) -> state & { return s; })
+        .def(
+            "__next__",
+            [](state &s) -> ValueType {
+              if (!s.first_or_done)
+                ++s.it;
+              else
+                s.first_or_done = false;
+              if (s.it == s.end) {
+                s.first_or_done = true;
+                throw stop_iteration();
+              }
+              return *s.it;
+            },
+            std::forward<Extra>(extra)..., Policy);
+  }
 
-    return cast(state{first, last, true});
+  return cast(state{first, last, true});
 }
 
-/// Makes an python iterator over the keys (`.first`) of a iterator over pairs from a
-/// first and past-the-end InputIterator.
+/// Makes an python iterator over the keys (`.first`) of a iterator over pairs
+/// from a first and past-the-end InputIterator.
 template <return_value_policy Policy = return_value_policy::reference_internal,
-          typename Iterator,
-          typename Sentinel,
+          typename Iterator, typename Sentinel,
           typename KeyType = decltype((*std::declval<Iterator>()).first),
           typename... Extra>
 iterator make_key_iterator(Iterator first, Sentinel last, Extra &&... extra) {
-    typedef detail::iterator_state<Iterator, Sentinel, true, Policy> state;
+  typedef detail::iterator_state<Iterator, Sentinel, true, Policy> state;
 
-    if (!detail::get_type_info(typeid(state), false)) {
-        class_<state>(handle(), "iterator", pybind11::module_local())
-            .def("__iter__", [](state &s) -> state& { return s; })
-            .def("__next__", [](state &s) -> KeyType {
-                if (!s.first_or_done)
-                    ++s.it;
-                else
-                    s.first_or_done = false;
-                if (s.it == s.end) {
-                    s.first_or_done = true;
-                    throw stop_iteration();
-                }
-                return (*s.it).first;
-            }, std::forward<Extra>(extra)..., Policy);
-    }
+  if (!detail::get_type_info(typeid(state), false)) {
+    class_<state>(handle(), "iterator", pybind11::module_local())
+        .def("__iter__", [](state &s) -> state & { return s; })
+        .def(
+            "__next__",
+            [](state &s) -> KeyType {
+              if (!s.first_or_done)
+                ++s.it;
+              else
+                s.first_or_done = false;
+              if (s.it == s.end) {
+                s.first_or_done = true;
+                throw stop_iteration();
+              }
+              return (*s.it).first;
+            },
+            std::forward<Extra>(extra)..., Policy);
+  }
 
-    return cast(state{first, last, true});
+  return cast(state{first, last, true});
 }
 
-/// Makes an iterator over values of an stl container or other container supporting
-/// `std::begin()`/`std::end()`
+/// Makes an iterator over values of an stl container or other container
+/// supporting `std::begin()`/`std::end()`
 template <return_value_policy Policy = return_value_policy::reference_internal,
-          typename Type, typename... Extra> iterator make_iterator(Type &value, Extra&&... extra) {
-    return make_iterator<Policy>(std::begin(value), std::end(value), extra...);
+          typename Type, typename... Extra>
+iterator make_iterator(Type &value, Extra &&... extra) {
+  return make_iterator<Policy>(std::begin(value), std::end(value), extra...);
 }
 
-/// Makes an iterator over the keys (`.first`) of a stl map-like container supporting
-/// `std::begin()`/`std::end()`
+/// Makes an iterator over the keys (`.first`) of a stl map-like container
+/// supporting `std::begin()`/`std::end()`
 template <return_value_policy Policy = return_value_policy::reference_internal,
-          typename Type, typename... Extra> iterator make_key_iterator(Type &value, Extra&&... extra) {
-    return make_key_iterator<Policy>(std::begin(value), std::end(value), extra...);
+          typename Type, typename... Extra>
+iterator make_key_iterator(Type &value, Extra &&... extra) {
+  return make_key_iterator<Policy>(std::begin(value), std::end(value),
+                                   extra...);
 }
 
-template <typename InputType, typename OutputType> void implicitly_convertible() {
-    struct set_flag {
-        bool &flag;
-        set_flag(bool &flag) : flag(flag) { flag = true; }
-        ~set_flag() { flag = false; }
-    };
-    auto implicit_caster = [](PyObject *obj, PyTypeObject *type) -> PyObject * {
-        static bool currently_used = false;
-        if (currently_used) // implicit conversions are non-reentrant
-            return nullptr;
-        set_flag flag_helper(currently_used);
-        if (!detail::make_caster<InputType>().load(obj, false))
-            return nullptr;
-        tuple args(1);
-        args[0] = obj;
-        PyObject *result = PyObject_Call((PyObject *) type, args.ptr(), nullptr);
-        if (result == nullptr)
-            PyErr_Clear();
-        return result;
-    };
+template <typename InputType, typename OutputType>
+void implicitly_convertible() {
+  struct set_flag {
+    bool &flag;
+    set_flag(bool &flag) : flag(flag) { flag = true; }
+    ~set_flag() { flag = false; }
+  };
+  auto implicit_caster = [](PyObject *obj, PyTypeObject *type) -> PyObject * {
+    static bool currently_used = false;
+    if (currently_used) // implicit conversions are non-reentrant
+      return nullptr;
+    set_flag flag_helper(currently_used);
+    if (!detail::make_caster<InputType>().load(obj, false))
+      return nullptr;
+    tuple args(1);
+    args[0] = obj;
+    PyObject *result = PyObject_Call((PyObject *)type, args.ptr(), nullptr);
+    if (result == nullptr)
+      PyErr_Clear();
+    return result;
+  };
 
-    if (auto tinfo = detail::get_type_info(typeid(OutputType)))
-        tinfo->implicit_conversions.push_back(implicit_caster);
-    else
-        pybind11_fail("implicitly_convertible: Unable to find type " + type_id<OutputType>());
+  if (auto tinfo = detail::get_type_info(typeid(OutputType)))
+    tinfo->implicit_conversions.push_back(implicit_caster);
+  else
+    pybind11_fail("implicitly_convertible: Unable to find type " +
+                  type_id<OutputType>());
 }
 
 template <typename ExceptionTranslator>
-void register_exception_translator(ExceptionTranslator&& translator) {
-    detail::get_internals().registered_exception_translators.push_front(
-        std::forward<ExceptionTranslator>(translator));
+void register_exception_translator(ExceptionTranslator &&translator) {
+  detail::get_internals().registered_exception_translators.push_front(
+      std::forward<ExceptionTranslator>(translator));
 }
 
 /**
@@ -1761,95 +2013,102 @@ void register_exception_translator(ExceptionTranslator&& translator) {
  * It is not (yet) possible to use as a py::base.
  * Template type argument is reserved for future use.
  */
-template <typename type>
-class exception : public object {
+template <typename type> class exception : public object {
 public:
-    exception() = default;
-    exception(handle scope, const char *name, PyObject *base = PyExc_Exception) {
-        std::string full_name = scope.attr("__name__").cast<std::string>() +
-                                std::string(".") + name;
-        m_ptr = PyErr_NewException(const_cast<char *>(full_name.c_str()), base, NULL);
-        if (hasattr(scope, name))
-            pybind11_fail("Error during initialization: multiple incompatible "
-                          "definitions with name \"" + std::string(name) + "\"");
-        scope.attr(name) = *this;
-    }
+  exception() = default;
+  exception(handle scope, const char *name, PyObject *base = PyExc_Exception) {
+    std::string full_name =
+        scope.attr("__name__").cast<std::string>() + std::string(".") + name;
+    m_ptr =
+        PyErr_NewException(const_cast<char *>(full_name.c_str()), base, NULL);
+    if (hasattr(scope, name))
+      pybind11_fail("Error during initialization: multiple incompatible "
+                    "definitions with name \"" +
+                    std::string(name) + "\"");
+    scope.attr(name) = *this;
+  }
 
-    // Sets the current python exception to this exception object with the given message
-    void operator()(const char *message) {
-        PyErr_SetString(m_ptr, message);
-    }
+  // Sets the current python exception to this exception object with the given
+  // message
+  void operator()(const char *message) { PyErr_SetString(m_ptr, message); }
 };
 
 NAMESPACE_BEGIN(detail)
-// Returns a reference to a function-local static exception object used in the simple
-// register_exception approach below.  (It would be simpler to have the static local variable
-// directly in register_exception, but that makes clang <3.5 segfault - issue #1349).
+// Returns a reference to a function-local static exception object used in the
+// simple register_exception approach below.  (It would be simpler to have the
+// static local variable directly in register_exception, but that makes clang
+// <3.5 segfault - issue #1349).
 template <typename CppException>
-exception<CppException> &get_exception_object() { static exception<CppException> ex; return ex; }
+exception<CppException> &get_exception_object() {
+  static exception<CppException> ex;
+  return ex;
+}
 NAMESPACE_END(detail)
 
 /**
- * Registers a Python exception in `m` of the given `name` and installs an exception translator to
- * translate the C++ exception to the created Python exception using the exceptions what() method.
- * This is intended for simple exception translations; for more complex translation, register the
- * exception object and translator directly.
+ * Registers a Python exception in `m` of the given `name` and installs an
+ * exception translator to translate the C++ exception to the created Python
+ * exception using the exceptions what() method. This is intended for simple
+ * exception translations; for more complex translation, register the exception
+ * object and translator directly.
  */
 template <typename CppException>
-exception<CppException> &register_exception(handle scope,
-                                            const char *name,
+exception<CppException> &register_exception(handle scope, const char *name,
                                             PyObject *base = PyExc_Exception) {
-    auto &ex = detail::get_exception_object<CppException>();
-    if (!ex) ex = exception<CppException>(scope, name, base);
+  auto &ex = detail::get_exception_object<CppException>();
+  if (!ex)
+    ex = exception<CppException>(scope, name, base);
 
-    register_exception_translator([](std::exception_ptr p) {
-        if (!p) return;
-        try {
-            std::rethrow_exception(p);
-        } catch (const CppException &e) {
-            detail::get_exception_object<CppException>()(e.what());
-        }
-    });
-    return ex;
+  register_exception_translator([](std::exception_ptr p) {
+    if (!p)
+      return;
+    try {
+      std::rethrow_exception(p);
+    } catch (const CppException &e) {
+      detail::get_exception_object<CppException>()(e.what());
+    }
+  });
+  return ex;
 }
 
 NAMESPACE_BEGIN(detail)
 PYBIND11_NOINLINE inline void print(tuple args, dict kwargs) {
-    auto strings = tuple(args.size());
-    for (size_t i = 0; i < args.size(); ++i) {
-        strings[i] = str(args[i]);
+  auto strings = tuple(args.size());
+  for (size_t i = 0; i < args.size(); ++i) {
+    strings[i] = str(args[i]);
+  }
+  auto sep = kwargs.contains("sep") ? kwargs["sep"] : cast(" ");
+  auto line = sep.attr("join")(strings);
+
+  object file;
+  if (kwargs.contains("file")) {
+    file = kwargs["file"].cast<object>();
+  } else {
+    try {
+      file = module::import("sys").attr("stdout");
+    } catch (const error_already_set &) {
+      /* If print() is called from code that is executed as
+         part of garbage collection during interpreter shutdown,
+         importing 'sys' can fail. Give up rather than crashing the
+         interpreter in this case. */
+      return;
     }
-    auto sep = kwargs.contains("sep") ? kwargs["sep"] : cast(" ");
-    auto line = sep.attr("join")(strings);
+  }
 
-    object file;
-    if (kwargs.contains("file")) {
-        file = kwargs["file"].cast<object>();
-    } else {
-        try {
-            file = module::import("sys").attr("stdout");
-        } catch (const error_already_set &) {
-            /* If print() is called from code that is executed as
-               part of garbage collection during interpreter shutdown,
-               importing 'sys' can fail. Give up rather than crashing the
-               interpreter in this case. */
-            return;
-        }
-    }
+  auto write = file.attr("write");
+  write(line);
+  write(kwargs.contains("end") ? kwargs["end"] : cast("\n"));
 
-    auto write = file.attr("write");
-    write(line);
-    write(kwargs.contains("end") ? kwargs["end"] : cast("\n"));
-
-    if (kwargs.contains("flush") && kwargs["flush"].cast<bool>())
-        file.attr("flush")();
+  if (kwargs.contains("flush") && kwargs["flush"].cast<bool>())
+    file.attr("flush")();
 }
 NAMESPACE_END(detail)
 
-template <return_value_policy policy = return_value_policy::automatic_reference, typename... Args>
-void print(Args &&...args) {
-    auto c = detail::collect_arguments<policy>(std::forward<Args>(args)...);
-    detail::print(c.args(), c.kwargs());
+template <return_value_policy policy = return_value_policy::automatic_reference,
+          typename... Args>
+void print(Args &&... args) {
+  auto c = detail::collect_arguments<policy>(std::forward<Args>(args)...);
+  detail::print(c.args(), c.kwargs());
 }
 
 #if defined(WITH_THREAD) && !defined(PYPY_VERSION)
@@ -1878,223 +2137,237 @@ void print(Args &&...args) {
 
 class gil_scoped_acquire {
 public:
-    PYBIND11_NOINLINE gil_scoped_acquire() {
-        auto const &internals = detail::get_internals();
-        tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate);
+  PYBIND11_NOINLINE gil_scoped_acquire() {
+    auto const &internals = detail::get_internals();
+    tstate = (PyThreadState *)PYBIND11_TLS_GET_VALUE(internals.tstate);
 
-        if (!tstate) {
-            /* Check if the GIL was acquired using the PyGILState_* API instead (e.g. if
-               calling from a Python thread). Since we use a different key, this ensures
-               we don't create a new thread state and deadlock in PyEval_AcquireThread
-               below. Note we don't save this state with internals.tstate, since we don't
-               create it we would fail to clear it (its reference count should be > 0). */
-            tstate = PyGILState_GetThisThreadState();
-        }
-
-        if (!tstate) {
-            tstate = PyThreadState_New(internals.istate);
-            #if !defined(NDEBUG)
-                if (!tstate)
-                    pybind11_fail("scoped_acquire: could not create thread state!");
-            #endif
-            tstate->gilstate_counter = 0;
-            PYBIND11_TLS_REPLACE_VALUE(internals.tstate, tstate);
-        } else {
-            release = detail::get_thread_state_unchecked() != tstate;
-        }
-
-        if (release) {
-            /* Work around an annoying assertion in PyThreadState_Swap */
-            #if defined(Py_DEBUG)
-                PyInterpreterState *interp = tstate->interp;
-                tstate->interp = nullptr;
-            #endif
-            PyEval_AcquireThread(tstate);
-            #if defined(Py_DEBUG)
-                tstate->interp = interp;
-            #endif
-        }
-
-        inc_ref();
+    if (!tstate) {
+      /* Check if the GIL was acquired using the PyGILState_* API instead (e.g.
+         if calling from a Python thread). Since we use a different key, this
+         ensures we don't create a new thread state and deadlock in
+         PyEval_AcquireThread below. Note we don't save this state with
+         internals.tstate, since we don't create it we would fail to clear it
+         (its reference count should be > 0). */
+      tstate = PyGILState_GetThisThreadState();
     }
 
-    void inc_ref() {
-        ++tstate->gilstate_counter;
+    if (!tstate) {
+      tstate = PyThreadState_New(internals.istate);
+#if !defined(NDEBUG)
+      if (!tstate)
+        pybind11_fail("scoped_acquire: could not create thread state!");
+#endif
+      tstate->gilstate_counter = 0;
+      PYBIND11_TLS_REPLACE_VALUE(internals.tstate, tstate);
+    } else {
+      release = detail::get_thread_state_unchecked() != tstate;
     }
 
-    PYBIND11_NOINLINE void dec_ref() {
-        --tstate->gilstate_counter;
-        #if !defined(NDEBUG)
-            if (detail::get_thread_state_unchecked() != tstate)
-                pybind11_fail("scoped_acquire::dec_ref(): thread state must be current!");
-            if (tstate->gilstate_counter < 0)
-                pybind11_fail("scoped_acquire::dec_ref(): reference count underflow!");
-        #endif
-        if (tstate->gilstate_counter == 0) {
-            #if !defined(NDEBUG)
-                if (!release)
-                    pybind11_fail("scoped_acquire::dec_ref(): internal error!");
-            #endif
-            PyThreadState_Clear(tstate);
-            PyThreadState_DeleteCurrent();
-            PYBIND11_TLS_DELETE_VALUE(detail::get_internals().tstate);
-            release = false;
-        }
+    if (release) {
+/* Work around an annoying assertion in PyThreadState_Swap */
+#if defined(Py_DEBUG)
+      PyInterpreterState *interp = tstate->interp;
+      tstate->interp = nullptr;
+#endif
+      PyEval_AcquireThread(tstate);
+#if defined(Py_DEBUG)
+      tstate->interp = interp;
+#endif
     }
 
-    PYBIND11_NOINLINE ~gil_scoped_acquire() {
-        dec_ref();
-        if (release)
-           PyEval_SaveThread();
+    inc_ref();
+  }
+
+  void inc_ref() { ++tstate->gilstate_counter; }
+
+  PYBIND11_NOINLINE void dec_ref() {
+    --tstate->gilstate_counter;
+#if !defined(NDEBUG)
+    if (detail::get_thread_state_unchecked() != tstate)
+      pybind11_fail("scoped_acquire::dec_ref(): thread state must be current!");
+    if (tstate->gilstate_counter < 0)
+      pybind11_fail("scoped_acquire::dec_ref(): reference count underflow!");
+#endif
+    if (tstate->gilstate_counter == 0) {
+#if !defined(NDEBUG)
+      if (!release)
+        pybind11_fail("scoped_acquire::dec_ref(): internal error!");
+#endif
+      PyThreadState_Clear(tstate);
+      PyThreadState_DeleteCurrent();
+      PYBIND11_TLS_DELETE_VALUE(detail::get_internals().tstate);
+      release = false;
     }
+  }
+
+  PYBIND11_NOINLINE ~gil_scoped_acquire() {
+    dec_ref();
+    if (release)
+      PyEval_SaveThread();
+  }
+
 private:
-    PyThreadState *tstate = nullptr;
-    bool release = true;
+  PyThreadState *tstate = nullptr;
+  bool release = true;
 };
 
 class gil_scoped_release {
 public:
-    explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) {
-        // `get_internals()` must be called here unconditionally in order to initialize
-        // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an
-        // initialization race could occur as multiple threads try `gil_scoped_acquire`.
-        const auto &internals = detail::get_internals();
-        tstate = PyEval_SaveThread();
-        if (disassoc) {
-            auto key = internals.tstate;
-            PYBIND11_TLS_DELETE_VALUE(key);
-        }
+  explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) {
+    // `get_internals()` must be called here unconditionally in order to
+    // initialize `internals.tstate` for subsequent `gil_scoped_acquire` calls.
+    // Otherwise, an initialization race could occur as multiple threads try
+    // `gil_scoped_acquire`.
+    const auto &internals = detail::get_internals();
+    tstate = PyEval_SaveThread();
+    if (disassoc) {
+      auto key = internals.tstate;
+      PYBIND11_TLS_DELETE_VALUE(key);
     }
-    ~gil_scoped_release() {
-        if (!tstate)
-            return;
-        PyEval_RestoreThread(tstate);
-        if (disassoc) {
-            auto key = detail::get_internals().tstate;
-            PYBIND11_TLS_REPLACE_VALUE(key, tstate);
-        }
+  }
+  ~gil_scoped_release() {
+    if (!tstate)
+      return;
+    PyEval_RestoreThread(tstate);
+    if (disassoc) {
+      auto key = detail::get_internals().tstate;
+      PYBIND11_TLS_REPLACE_VALUE(key, tstate);
     }
+  }
+
 private:
-    PyThreadState *tstate;
-    bool disassoc;
+  PyThreadState *tstate;
+  bool disassoc;
 };
 #elif defined(PYPY_VERSION)
 class gil_scoped_acquire {
-    PyGILState_STATE state;
+  PyGILState_STATE state;
+
 public:
-    gil_scoped_acquire() { state = PyGILState_Ensure(); }
-    ~gil_scoped_acquire() { PyGILState_Release(state); }
+  gil_scoped_acquire() { state = PyGILState_Ensure(); }
+  ~gil_scoped_acquire() { PyGILState_Release(state); }
 };
 
 class gil_scoped_release {
-    PyThreadState *state;
+  PyThreadState *state;
+
 public:
-    gil_scoped_release() { state = PyEval_SaveThread(); }
-    ~gil_scoped_release() { PyEval_RestoreThread(state); }
+  gil_scoped_release() { state = PyEval_SaveThread(); }
+  ~gil_scoped_release() { PyEval_RestoreThread(state); }
 };
 #else
-class gil_scoped_acquire { };
-class gil_scoped_release { };
+class gil_scoped_acquire {};
+class gil_scoped_release {};
 #endif
 
 error_already_set::~error_already_set() {
-    if (m_type) {
-        error_scope scope;
-        gil_scoped_acquire gil;
-        m_type.release().dec_ref();
-        m_value.release().dec_ref();
-        m_trace.release().dec_ref();
-    }
+  if (m_type) {
+    error_scope scope;
+    gil_scoped_acquire gil;
+    m_type.release().dec_ref();
+    m_value.release().dec_ref();
+    m_trace.release().dec_ref();
+  }
 }
 
-inline function get_type_overload(const void *this_ptr, const detail::type_info *this_type, const char *name)  {
-    handle self = detail::get_object_handle(this_ptr, this_type);
-    if (!self)
-        return function();
-    handle type = self.get_type();
-    auto key = std::make_pair(type.ptr(), name);
+inline function get_type_overload(const void *this_ptr,
+                                  const detail::type_info *this_type,
+                                  const char *name) {
+  handle self = detail::get_object_handle(this_ptr, this_type);
+  if (!self)
+    return function();
+  handle type = self.get_type();
+  auto key = std::make_pair(type.ptr(), name);
 
-    /* Cache functions that aren't overloaded in Python to avoid
-       many costly Python dictionary lookups below */
-    auto &cache = detail::get_internals().inactive_overload_cache;
-    if (cache.find(key) != cache.end())
-        return function();
+  /* Cache functions that aren't overloaded in Python to avoid
+     many costly Python dictionary lookups below */
+  auto &cache = detail::get_internals().inactive_overload_cache;
+  if (cache.find(key) != cache.end())
+    return function();
 
-    function overload = getattr(self, name, function());
-    if (overload.is_cpp_function()) {
-        cache.insert(key);
-        return function();
-    }
+  function overload = getattr(self, name, function());
+  if (overload.is_cpp_function()) {
+    cache.insert(key);
+    return function();
+  }
 
-    /* Don't call dispatch code if invoked from overridden function.
-       Unfortunately this doesn't work on PyPy. */
+  /* Don't call dispatch code if invoked from overridden function.
+     Unfortunately this doesn't work on PyPy. */
 #if !defined(PYPY_VERSION)
-    PyFrameObject *frame = PyThreadState_Get()->frame;
-    if (frame && (std::string) str(frame->f_code->co_name) == name &&
-        frame->f_code->co_argcount > 0) {
-        PyFrame_FastToLocals(frame);
-        PyObject *self_caller = PyDict_GetItem(
-            frame->f_locals, PyTuple_GET_ITEM(frame->f_code->co_varnames, 0));
-        if (self_caller == self.ptr())
-            return function();
-    }
+  PyFrameObject *frame = PyThreadState_Get()->frame;
+  if (frame && (std::string)str(frame->f_code->co_name) == name &&
+      frame->f_code->co_argcount > 0) {
+    PyFrame_FastToLocals(frame);
+    PyObject *self_caller = PyDict_GetItem(
+        frame->f_locals, PyTuple_GET_ITEM(frame->f_code->co_varnames, 0));
+    if (self_caller == self.ptr())
+      return function();
+  }
 #else
-    /* PyPy currently doesn't provide a detailed cpyext emulation of
-       frame objects, so we have to emulate this using Python. This
-       is going to be slow..*/
-    dict d; d["self"] = self; d["name"] = pybind11::str(name);
-    PyObject *result = PyRun_String(
-        "import inspect\n"
-        "frame = inspect.currentframe()\n"
-        "if frame is not None:\n"
-        "    frame = frame.f_back\n"
-        "    if frame is not None and str(frame.f_code.co_name) == name and "
-        "frame.f_code.co_argcount > 0:\n"
-        "        self_caller = frame.f_locals[frame.f_code.co_varnames[0]]\n"
-        "        if self_caller == self:\n"
-        "            self = None\n",
-        Py_file_input, d.ptr(), d.ptr());
-    if (result == nullptr)
-        throw error_already_set();
-    if (d["self"].is_none())
-        return function();
-    Py_DECREF(result);
+  /* PyPy currently doesn't provide a detailed cpyext emulation of
+     frame objects, so we have to emulate this using Python. This
+     is going to be slow..*/
+  dict d;
+  d["self"] = self;
+  d["name"] = pybind11::str(name);
+  PyObject *result = PyRun_String(
+      "import inspect\n"
+      "frame = inspect.currentframe()\n"
+      "if frame is not None:\n"
+      "    frame = frame.f_back\n"
+      "    if frame is not None and str(frame.f_code.co_name) == name and "
+      "frame.f_code.co_argcount > 0:\n"
+      "        self_caller = frame.f_locals[frame.f_code.co_varnames[0]]\n"
+      "        if self_caller == self:\n"
+      "            self = None\n",
+      Py_file_input, d.ptr(), d.ptr());
+  if (result == nullptr)
+    throw error_already_set();
+  if (d["self"].is_none())
+    return function();
+  Py_DECREF(result);
 #endif
 
-    return overload;
+  return overload;
 }
 
 /** \rst
-  Try to retrieve a python method by the provided name from the instance pointed to by the this_ptr.
+  Try to retrieve a Python method by the provided name from the instance
+  pointed to by ``this_ptr``.
 
-  :this_ptr: The pointer to the object the overload should be retrieved for. This should be the first
-                   non-trampoline class encountered in the inheritance chain.
-  :name: The name of the overloaded Python method to retrieve.
-  :return: The Python method by this name from the object or an empty function wrapper.
+  :this_ptr: The pointer to the object the overload should be retrieved for. This should be the first
+                   non-trampoline class encountered in the inheritance chain.
+  :name: The name of the overloaded Python method to retrieve.
+  :return: The Python method by this name from the object or an empty function wrapper.
  \endrst */
 template <class T> function get_overload(const T *this_ptr, const char *name) {
-    auto tinfo = detail::get_type_info(typeid(T));
-    return tinfo ? get_type_overload(this_ptr, tinfo, name) : function();
+  auto tinfo = detail::get_type_info(typeid(T));
+  return tinfo ? get_type_overload(this_ptr, tinfo, name) : function();
 }
 
-#define PYBIND11_OVERLOAD_INT(ret_type, cname, name, ...) { \
-        pybind11::gil_scoped_acquire gil; \
-        pybind11::function overload = pybind11::get_overload(static_cast<const cname *>(this), name); \
-        if (overload) { \
-            auto o = overload(__VA_ARGS__); \
-            if (pybind11::detail::cast_is_temporary_value_reference<ret_type>::value) { \
-                static pybind11::detail::overload_caster_t<ret_type> caster; \
-                return pybind11::detail::cast_ref<ret_type>(std::move(o), caster); \
-            } \
-            else return pybind11::detail::cast_safe<ret_type>(std::move(o)); \
-        } \
-    }
+#define PYBIND11_OVERLOAD_INT(ret_type, cname, name, ...)                      \
+  {                                                                            \
+    pybind11::gil_scoped_acquire gil;                                          \
+    pybind11::function overload =                                              \
+        pybind11::get_overload(static_cast<const cname *>(this), name);        \
+    if (overload) {                                                            \
+      auto o = overload(__VA_ARGS__);                                          \
+      if (pybind11::detail::cast_is_temporary_value_reference<                 \
+              ret_type>::value) {                                              \
+        static pybind11::detail::overload_caster_t<ret_type> caster;           \
+        return pybind11::detail::cast_ref<ret_type>(std::move(o), caster);     \
+      } else                                                                   \
+        return pybind11::detail::cast_safe<ret_type>(std::move(o));            \
+    }                                                                          \
+  }
 
 /** \rst
-    Macro to populate the virtual method in the trampoline class. This macro tries to look up a method named 'fn'
-    from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return
-    the appropriate type. See :ref:`overriding_virtuals` for more information. This macro should be used when the method
-    name in C is not the same as the method name in Python. For example with `__str__`.
+    Macro to populate the virtual method in the trampoline class. This macro
+    tries to look up a method named 'name' from the Python side, deals with the
+    :ref:`gil` and necessary argument conversions to call this method and return
+    the appropriate type. See :ref:`overriding_virtuals` for more information.
+    This macro should be used when the method name in C++ is not the same as
+    the method name in Python. For example with `__str__`.
 
     .. code-block:: cpp
 
@@ -2107,23 +2380,28 @@ template <class T> function get_overload(const T *this_ptr, const char *name) {
         );
       }
 \endrst */
-#define PYBIND11_OVERLOAD_NAME(ret_type, cname, name, fn, ...) \
-    PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \
-    return cname::fn(__VA_ARGS__)
+#define PYBIND11_OVERLOAD_NAME(ret_type, cname, name, fn, ...)                 \
+  PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name,   \
+                        __VA_ARGS__)                                           \
+  return cname::fn(__VA_ARGS__)
 
 /** \rst
-    Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD_NAME`, except that it
-    throws if no overload can be found.
-\endrst */
-#define PYBIND11_OVERLOAD_PURE_NAME(ret_type, cname, name, fn, ...) \
-    PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \
-    pybind11::pybind11_fail("Tried to call pure virtual function \"" PYBIND11_STRINGIFY(cname) "::" name "\"");
+    Macro for pure virtual functions. This macro is identical to
+    :c:macro:`PYBIND11_OVERLOAD_NAME`, except that it throws if no overload can
+    be found. \endrst */
+#define PYBIND11_OVERLOAD_PURE_NAME(ret_type, cname, name, fn, ...)            \
+  PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name,   \
+                        __VA_ARGS__)                                           \
+  pybind11::pybind11_fail(                                                     \
+      "Tried to call pure virtual function \"" PYBIND11_STRINGIFY(             \
+          cname) "::" name "\"");
 
 /** \rst
-    Macro to populate the virtual method in the trampoline class. This macro tries to look up the method
-    from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return
-    the appropriate type. This macro should be used if the method name in C and in Python are identical.
-    See :ref:`overriding_virtuals` for more information.
+    Macro to populate the virtual method in the trampoline class. This macro
+    tries to look up the method from the Python side, deals with the :ref:`gil`
+    and necessary argument conversions to call this method and return the
+    appropriate type. This macro should be used if the method name in C++ and in
+    Python are identical. See :ref:`overriding_virtuals` for more information.
 
     .. code-block:: cpp
 
@@ -2137,26 +2415,28 @@ template <class T> function get_overload(const T *this_ptr, const char *name) {
               PYBIND11_OVERLOAD_PURE(
                   std::string, // Return type (ret_type)
                   Animal,      // Parent class (cname)
-                  go,          // Name of function in C++ (must match Python name) (fn)
-                  n_times      // Argument(s) (...)
+                  go,          // Name of function in C++ (must match Python name) (fn)
+                  n_times      // Argument(s) (...)
               );
           }
       };
 \endrst */
-#define PYBIND11_OVERLOAD(ret_type, cname, fn, ...) \
-    PYBIND11_OVERLOAD_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__)
+#define PYBIND11_OVERLOAD(ret_type, cname, fn, ...)                            \
+  PYBIND11_OVERLOAD_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn,   \
+                         fn, __VA_ARGS__)
 
 /** \rst
-    Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD`, except that it throws
-    if no overload can be found.
+    Macro for pure virtual functions. This macro is identical to
+    :c:macro:`PYBIND11_OVERLOAD`, except that it throws if no overload is found.
 \endrst */
-#define PYBIND11_OVERLOAD_PURE(ret_type, cname, fn, ...) \
-    PYBIND11_OVERLOAD_PURE_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__)
+#define PYBIND11_OVERLOAD_PURE(ret_type, cname, fn, ...)                       \
+  PYBIND11_OVERLOAD_PURE_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname),   \
+                              #fn, fn, __VA_ARGS__)
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
 
 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#  pragma warning(pop)
+#pragma warning(pop)
 #elif defined(__GNUG__) && !defined(__clang__)
-#  pragma GCC diagnostic pop
+#pragma GCC diagnostic pop
 #endif
diff --git a/python/src/pybind11/pytypes.h b/python/src/pybind11/pytypes.h
index 2d573dfad..933757ad1 100644
--- a/python/src/pybind11/pytypes.h
+++ b/python/src/pybind11/pytypes.h
@@ -9,17 +9,20 @@
 
 #pragma once
 
-#include "detail/common.h"
 #include "buffer_info.h"
-#include <utility>
+#include "detail/common.h"
 #include <type_traits>
+#include <utility>
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
 /* A few forward declarations */
-class handle; class object;
-class str; class iterator;
-struct arg; struct arg_v;
+class handle;
+class object;
+class str;
+class iterator;
+struct arg;
+struct arg_v;
 
 NAMESPACE_BEGIN(detail)
 class args_proxy;
@@ -28,13 +31,13 @@ inline bool isinstance_generic(handle obj, const std::type_info &tp);
 // Accessor forward declarations
 template <typename Policy> class accessor;
 namespace accessor_policies {
-    struct obj_attr;
-    struct str_attr;
-    struct generic_item;
-    struct sequence_item;
-    struct list_item;
-    struct tuple_item;
-}
+struct obj_attr;
+struct str_attr;
+struct generic_item;
+struct sequence_item;
+struct list_item;
+struct tuple_item;
+} // namespace accessor_policies
 using obj_attr_accessor = accessor<accessor_policies::obj_attr>;
 using str_attr_accessor = accessor<accessor_policies::str_attr>;
 using item_accessor = accessor<accessor_policies::generic_item>;
@@ -43,119 +46,139 @@ using list_accessor = accessor<accessor_policies::list_item>;
 using tuple_accessor = accessor<accessor_policies::tuple_item>;
 
 /// Tag and check to identify a class which implements the Python object API
-class pyobject_tag { };
-template <typename T> using is_pyobject = std::is_base_of<pyobject_tag, remove_reference_t<T>>;
+class pyobject_tag {};
+template <typename T>
+using is_pyobject = std::is_base_of<pyobject_tag, remove_reference_t<T>>;
 
 /** \rst
-    A mixin class which adds common functions to `handle`, `object` and various accessors.
-    The only requirement for `Derived` is to implement ``PyObject *Derived::ptr() const``.
-\endrst */
-template <typename Derived>
-class object_api : public pyobject_tag {
-    const Derived &derived() const { return static_cast<const Derived &>(*this); }
+    A mixin class which adds common functions to `handle`, `object` and various
+    accessors. The only requirement for `Derived` is to implement
+    ``PyObject *Derived::ptr() const``. \endrst */
+template <typename Derived> class object_api : public pyobject_tag {
+  const Derived &derived() const { return static_cast<const Derived &>(*this); }
 
 public:
-    /** \rst
-        Return an iterator equivalent to calling ``iter()`` in Python. The object
-        must be a collection which supports the iteration protocol.
-    \endrst */
-    iterator begin() const;
-    /// Return a sentinel which ends iteration.
-    iterator end() const;
+  /** \rst
+      Return an iterator equivalent to calling ``iter()`` in Python. The object
+      must be a collection which supports the iteration protocol.
+  \endrst */
+  iterator begin() const;
+  /// Return a sentinel which ends iteration.
+  iterator end() const;
 
-    /** \rst
-        Return an internal functor to invoke the object's sequence protocol. Casting
-        the returned ``detail::item_accessor`` instance to a `handle` or `object`
-        subclass causes a corresponding call to ``__getitem__``. Assigning a `handle`
-        or `object` subclass causes a call to ``__setitem__``.
-    \endrst */
-    item_accessor operator[](handle key) const;
-    /// See above (the only difference is that they key is provided as a string literal)
-    item_accessor operator[](const char *key) const;
+  /** \rst
+      Return an internal functor to invoke the object's sequence protocol.
+      Casting the returned ``detail::item_accessor`` instance to a `handle` or
+      `object` subclass causes a corresponding call to ``__getitem__``.
+      Assigning a `handle` or `object` subclass causes a call to ``__setitem__``. \endrst */
+  item_accessor operator[](handle key) const;
+  /// See above (the only difference is that the key is provided as a string
+  /// literal)
+  item_accessor operator[](const char *key) const;
 
-    /** \rst
-        Return an internal functor to access the object's attributes. Casting the
-        returned ``detail::obj_attr_accessor`` instance to a `handle` or `object`
-        subclass causes a corresponding call to ``getattr``. Assigning a `handle`
-        or `object` subclass causes a call to ``setattr``.
-    \endrst */
-    obj_attr_accessor attr(handle key) const;
-    /// See above (the only difference is that they key is provided as a string literal)
-    str_attr_accessor attr(const char *key) const;
+  /** \rst
+      Return an internal functor to access the object's attributes. Casting the
+      returned ``detail::obj_attr_accessor`` instance to a `handle` or `object`
+      subclass causes a corresponding call to ``getattr``. Assigning a `handle`
+      or `object` subclass causes a call to ``setattr``.
+  \endrst */
+  obj_attr_accessor attr(handle key) const;
+  /// See above (the only difference is that the key is provided as a string
+  /// literal)
+  str_attr_accessor attr(const char *key) const;
 
-    /** \rst
-        Matches * unpacking in Python, e.g. to unpack arguments out of a ``tuple``
-        or ``list`` for a function call. Applying another * to the result yields
-        ** unpacking, e.g. to unpack a dict as function keyword arguments.
-        See :ref:`calling_python_functions`.
-    \endrst */
-    args_proxy operator*() const;
+  /** \rst
+      Matches * unpacking in Python, e.g. to unpack arguments out of a ``tuple``
+      or ``list`` for a function call. Applying another * to the result yields
+      ** unpacking, e.g. to unpack a dict as function keyword arguments.
+      See :ref:`calling_python_functions`.
+  \endrst */
+  args_proxy operator*() const;
 
-    /// Check if the given item is contained within this object, i.e. ``item in obj``.
-    template <typename T> bool contains(T &&item) const;
+  /// Check if the given item is contained within this object, i.e. ``item in
+  /// obj``.
+  template <typename T> bool contains(T &&item) const;
 
-    /** \rst
-        Assuming the Python object is a function or implements the ``__call__``
-        protocol, ``operator()`` invokes the underlying function, passing an
-        arbitrary set of parameters. The result is returned as a `object` and
-        may need to be converted back into a Python object using `handle::cast()`.
+  /** \rst
+      Assuming the Python object is a function or implements the ``__call__``
+      protocol, ``operator()`` invokes the underlying function, passing an
+      arbitrary set of parameters. The result is returned as a `object` and
+      may need to be converted back into a Python object using `handle::cast()`.
 
-        When some of the arguments cannot be converted to Python objects, the
-        function will throw a `cast_error` exception. When the Python function
-        call fails, a `error_already_set` exception is thrown.
-    \endrst */
-    template <return_value_policy policy = return_value_policy::automatic_reference, typename... Args>
-    object operator()(Args &&...args) const;
-    template <return_value_policy policy = return_value_policy::automatic_reference, typename... Args>
-    PYBIND11_DEPRECATED("call(...) was deprecated in favor of operator()(...)")
-        object call(Args&&... args) const;
+      When some of the arguments cannot be converted to Python objects, the
+      function will throw a `cast_error` exception. When the Python function
+      call fails, a `error_already_set` exception is thrown.
+  \endrst */
+  template <
+      return_value_policy policy = return_value_policy::automatic_reference,
+      typename... Args>
+  object operator()(Args &&... args) const;
+  template <
+      return_value_policy policy = return_value_policy::automatic_reference,
+      typename... Args>
+  PYBIND11_DEPRECATED("call(...) was deprecated in favor of operator()(...)")
+  object call(Args &&... args) const;
 
-    /// Equivalent to ``obj is other`` in Python.
-    bool is(object_api const& other) const { return derived().ptr() == other.derived().ptr(); }
-    /// Equivalent to ``obj is None`` in Python.
-    bool is_none() const { return derived().ptr() == Py_None; }
-    /// Equivalent to obj == other in Python
-    bool equal(object_api const &other) const      { return rich_compare(other, Py_EQ); }
-    bool not_equal(object_api const &other) const  { return rich_compare(other, Py_NE); }
-    bool operator<(object_api const &other) const  { return rich_compare(other, Py_LT); }
-    bool operator<=(object_api const &other) const { return rich_compare(other, Py_LE); }
-    bool operator>(object_api const &other) const  { return rich_compare(other, Py_GT); }
-    bool operator>=(object_api const &other) const { return rich_compare(other, Py_GE); }
+  /// Equivalent to ``obj is other`` in Python.
+  bool is(object_api const &other) const {
+    return derived().ptr() == other.derived().ptr();
+  }
+  /// Equivalent to ``obj is None`` in Python.
+  bool is_none() const { return derived().ptr() == Py_None; }
+  /// Equivalent to obj == other in Python
+  bool equal(object_api const &other) const {
+    return rich_compare(other, Py_EQ);
+  }
+  bool not_equal(object_api const &other) const {
+    return rich_compare(other, Py_NE);
+  }
+  bool operator<(object_api const &other) const {
+    return rich_compare(other, Py_LT);
+  }
+  bool operator<=(object_api const &other) const {
+    return rich_compare(other, Py_LE);
+  }
+  bool operator>(object_api const &other) const {
+    return rich_compare(other, Py_GT);
+  }
+  bool operator>=(object_api const &other) const {
+    return rich_compare(other, Py_GE);
+  }
 
-    object operator-() const;
-    object operator~() const;
-    object operator+(object_api const &other) const;
-    object operator+=(object_api const &other) const;
-    object operator-(object_api const &other) const;
-    object operator-=(object_api const &other) const;
-    object operator*(object_api const &other) const;
-    object operator*=(object_api const &other) const;
-    object operator/(object_api const &other) const;
-    object operator/=(object_api const &other) const;
-    object operator|(object_api const &other) const;
-    object operator|=(object_api const &other) const;
-    object operator&(object_api const &other) const;
-    object operator&=(object_api const &other) const;
-    object operator^(object_api const &other) const;
-    object operator^=(object_api const &other) const;
-    object operator<<(object_api const &other) const;
-    object operator<<=(object_api const &other) const;
-    object operator>>(object_api const &other) const;
-    object operator>>=(object_api const &other) const;
+  object operator-() const;
+  object operator~() const;
+  object operator+(object_api const &other) const;
+  object operator+=(object_api const &other) const;
+  object operator-(object_api const &other) const;
+  object operator-=(object_api const &other) const;
+  object operator*(object_api const &other) const;
+  object operator*=(object_api const &other) const;
+  object operator/(object_api const &other) const;
+  object operator/=(object_api const &other) const;
+  object operator|(object_api const &other) const;
+  object operator|=(object_api const &other) const;
+  object operator&(object_api const &other) const;
+  object operator&=(object_api const &other) const;
+  object operator^(object_api const &other) const;
+  object operator^=(object_api const &other) const;
+  object operator<<(object_api const &other) const;
+  object operator<<=(object_api const &other) const;
+  object operator>>(object_api const &other) const;
+  object operator>>=(object_api const &other) const;
 
-    PYBIND11_DEPRECATED("Use py::str(obj) instead")
-    pybind11::str str() const;
+  PYBIND11_DEPRECATED("Use py::str(obj) instead")
+  pybind11::str str() const;
 
-    /// Get or set the object's docstring, i.e. ``obj.__doc__``.
-    str_attr_accessor doc() const;
+  /// Get or set the object's docstring, i.e. ``obj.__doc__``.
+  str_attr_accessor doc() const;
 
-    /// Return the object's current reference count
-    int ref_count() const { return static_cast<int>(Py_REFCNT(derived().ptr())); }
-    /// Return a handle to the Python type object underlying the instance
-    handle get_type() const;
+  /// Return the object's current reference count
+  int ref_count() const { return static_cast<int>(Py_REFCNT(derived().ptr())); }
+  /// Return a handle to the Python type object underlying the instance
+  handle get_type() const;
 
 private:
-    bool rich_compare(object_api const &other, int value) const;
+  bool rich_compare(object_api const &other, int value) const;
 };
 
 NAMESPACE_END(detail)
@@ -163,9 +186,11 @@ NAMESPACE_END(detail)
 /** \rst
     Holds a reference to a Python object (no reference counting)
 
-    The `handle` class is a thin wrapper around an arbitrary Python object (i.e. a
-    ``PyObject *`` in Python's C API). It does not perform any automatic reference
-    counting and merely provides a basic C++ interface to various Python API functions.
+    The `handle` class is a thin wrapper around an
+    arbitrary Python object (i.e. a ``PyObject *`` in
+    Python's C API). It does not perform any automatic
+    reference counting and merely provides a basic C++
+    interface to various Python API functions.
 
     .. seealso::
         The `object` class inherits from `handle` and adds automatic reference
@@ -173,133 +198,150 @@ NAMESPACE_END(detail)
 \endrst */
 class handle : public detail::object_api<handle> {
 public:
-    /// The default constructor creates a handle with a ``nullptr``-valued pointer
-    handle() = default;
-    /// Creates a ``handle`` from the given raw Python object pointer
-    handle(PyObject *ptr) : m_ptr(ptr) { } // Allow implicit conversion from PyObject*
+  /// The default constructor creates a handle with a ``nullptr``-valued pointer
+  handle() = default;
+  /// Creates a ``handle`` from the given raw Python object pointer
+  handle(PyObject *ptr)
+      : m_ptr(ptr) {} // Allow implicit conversion from PyObject*
 
-    /// Return the underlying ``PyObject *`` pointer
-    PyObject *ptr() const { return m_ptr; }
-    PyObject *&ptr() { return m_ptr; }
+  /// Return the underlying ``PyObject *`` pointer
+  PyObject *ptr() const { return m_ptr; }
+  PyObject *&ptr() { return m_ptr; }
 
-    /** \rst
-        Manually increase the reference count of the Python object. Usually, it is
-        preferable to use the `object` class which derives from `handle` and calls
-        this function automatically. Returns a reference to itself.
-    \endrst */
-    const handle& inc_ref() const & { Py_XINCREF(m_ptr); return *this; }
+  /** \rst
+      Manually increase the reference count of the Python object. Usually, it is
+      preferable to use the `object` class which derives from `handle` and calls
+      this function automatically. Returns a reference to itself.
+  \endrst */
+  const handle &inc_ref() const & {
+    Py_XINCREF(m_ptr);
+    return *this;
+  }
 
-    /** \rst
-        Manually decrease the reference count of the Python object. Usually, it is
-        preferable to use the `object` class which derives from `handle` and calls
-        this function automatically. Returns a reference to itself.
-    \endrst */
-    const handle& dec_ref() const & { Py_XDECREF(m_ptr); return *this; }
+  /** \rst
+      Manually decrease the reference count of the Python object. Usually, it is
+      preferable to use the `object` class which derives from `handle` and calls
+      this function automatically. Returns a reference to itself.
+  \endrst */
+  const handle &dec_ref() const & {
+    Py_XDECREF(m_ptr);
+    return *this;
+  }
+
+  /** \rst
+      Attempt to cast the Python object into the given C++ type. A `cast_error`
+      will be thrown upon failure.
+  \endrst */
+  template <typename T> T cast() const;
+  /// Return ``true`` when the `handle` wraps a valid Python object
+  explicit operator bool() const { return m_ptr != nullptr; }
+  /** \rst
+      Deprecated: Check that the underlying pointers are the same.
+      Equivalent to ``obj1 is obj2`` in Python.
+  \endrst */
+  PYBIND11_DEPRECATED("Use obj1.is(obj2) instead")
+  bool operator==(const handle &h) const { return m_ptr == h.m_ptr; }
+  PYBIND11_DEPRECATED("Use !obj1.is(obj2) instead")
+  bool operator!=(const handle &h) const { return m_ptr != h.m_ptr; }
+  PYBIND11_DEPRECATED("Use handle::operator bool() instead")
+  bool check() const { return m_ptr != nullptr; }
 
-    /** \rst
-        Attempt to cast the Python object into the given C++ type. A `cast_error`
-        will be throw upon failure.
-    \endrst */
-    template <typename T> T cast() const;
-    /// Return ``true`` when the `handle` wraps a valid Python object
-    explicit operator bool() const { return m_ptr != nullptr; }
-    /** \rst
-        Deprecated: Check that the underlying pointers are the same.
-        Equivalent to ``obj1 is obj2`` in Python.
-    \endrst */
-    PYBIND11_DEPRECATED("Use obj1.is(obj2) instead")
-    bool operator==(const handle &h) const { return m_ptr == h.m_ptr; }
-    PYBIND11_DEPRECATED("Use !obj1.is(obj2) instead")
-    bool operator!=(const handle &h) const { return m_ptr != h.m_ptr; }
-    PYBIND11_DEPRECATED("Use handle::operator bool() instead")
-    bool check() const { return m_ptr != nullptr; }
 protected:
-    PyObject *m_ptr = nullptr;
+  PyObject *m_ptr = nullptr;
 };
 
 /** \rst
     Holds a reference to a Python object (with reference counting)
 
-    Like `handle`, the `object` class is a thin wrapper around an arbitrary Python
-    object (i.e. a ``PyObject *`` in Python's C API). In contrast to `handle`, it
-    optionally increases the object's reference count upon construction, and it
-    *always* decreases the reference count when the `object` instance goes out of
-    scope and is destructed. When using `object` instances consistently, it is much
-    easier to get reference counting right at the first attempt.
-\endrst */
+    Like `handle`, the `object` class is a thin wrapper around an arbitrary
+    Python object (i.e. a ``PyObject *`` in Python's C API). In contrast to
+    `handle`, it optionally increases the object's reference count upon
+    construction, and it *always* decreases the reference count when the
+    `object` instance goes out of scope and is destructed. When using `object`
+    instances consistently, it is much easier to get reference counting right
+    at the first attempt. \endrst */
 class object : public handle {
 public:
-    object() = default;
-    PYBIND11_DEPRECATED("Use reinterpret_borrow<object>() or reinterpret_steal<object>()")
-    object(handle h, bool is_borrowed) : handle(h) { if (is_borrowed) inc_ref(); }
-    /// Copy constructor; always increases the reference count
-    object(const object &o) : handle(o) { inc_ref(); }
-    /// Move constructor; steals the object from ``other`` and preserves its reference count
-    object(object &&other) noexcept { m_ptr = other.m_ptr; other.m_ptr = nullptr; }
-    /// Destructor; automatically calls `handle::dec_ref()`
-    ~object() { dec_ref(); }
+  object() = default;
+  PYBIND11_DEPRECATED(
+      "Use reinterpret_borrow<object>() or reinterpret_steal<object>()")
+  object(handle h, bool is_borrowed) : handle(h) {
+    if (is_borrowed)
+      inc_ref();
+  }
+  /// Copy constructor; always increases the reference count
+  object(const object &o) : handle(o) { inc_ref(); }
+  /// Move constructor; steals the object from ``other`` and preserves its
+  /// reference count
+  object(object &&other) noexcept {
+    m_ptr = other.m_ptr;
+    other.m_ptr = nullptr;
+  }
+  /// Destructor; automatically calls `handle::dec_ref()`
+  ~object() { dec_ref(); }
 
-    /** \rst
-        Resets the internal pointer to ``nullptr`` without without decreasing the
-        object's reference count. The function returns a raw handle to the original
-        Python object.
-    \endrst */
-    handle release() {
-      PyObject *tmp = m_ptr;
-      m_ptr = nullptr;
-      return handle(tmp);
+  /** \rst
+      Resets the internal pointer to ``nullptr`` without decreasing the
+      object's reference count. The function returns a raw handle to the
+      original Python object. \endrst */
+  handle release() {
+    PyObject *tmp = m_ptr;
+    m_ptr = nullptr;
+    return handle(tmp);
+  }
+
+  object &operator=(const object &other) {
+    other.inc_ref();
+    dec_ref();
+    m_ptr = other.m_ptr;
+    return *this;
+  }
+
+  object &operator=(object &&other) noexcept {
+    if (this != &other) {
+      handle temp(m_ptr);
+      m_ptr = other.m_ptr;
+      other.m_ptr = nullptr;
+      temp.dec_ref();
     }
+    return *this;
+  }
 
-    object& operator=(const object &other) {
-        other.inc_ref();
-        dec_ref();
-        m_ptr = other.m_ptr;
-        return *this;
-    }
-
-    object& operator=(object &&other) noexcept {
-        if (this != &other) {
-            handle temp(m_ptr);
-            m_ptr = other.m_ptr;
-            other.m_ptr = nullptr;
-            temp.dec_ref();
-        }
-        return *this;
-    }
-
-    // Calling cast() on an object lvalue just copies (via handle::cast)
-    template <typename T> T cast() const &;
-    // Calling on an object rvalue does a move, if needed and/or possible
-    template <typename T> T cast() &&;
+  // Calling cast() on an object lvalue just copies (via handle::cast)
+  template <typename T> T cast() const &;
+  // Calling on an object rvalue does a move, if needed and/or possible
+  template <typename T> T cast() &&;
 
 protected:
-    // Tags for choosing constructors from raw PyObject *
-    struct borrowed_t { };
-    struct stolen_t { };
+  // Tags for choosing constructors from raw PyObject *
+  struct borrowed_t {};
+  struct stolen_t {};
 
-    template <typename T> friend T reinterpret_borrow(handle);
-    template <typename T> friend T reinterpret_steal(handle);
+  template <typename T> friend T reinterpret_borrow(handle);
+  template <typename T> friend T reinterpret_steal(handle);
 
 public:
-    // Only accessible from derived classes and the reinterpret_* functions
-    object(handle h, borrowed_t) : handle(h) { inc_ref(); }
-    object(handle h, stolen_t) : handle(h) { }
+  // Only accessible from derived classes and the reinterpret_* functions
+  object(handle h, borrowed_t) : handle(h) { inc_ref(); }
+  object(handle h, stolen_t) : handle(h) {}
 };
 
 /** \rst
-    Declare that a `handle` or ``PyObject *`` is a certain type and borrow the reference.
-    The target type ``T`` must be `object` or one of its derived classes. The function
-    doesn't do any conversions or checks. It's up to the user to make sure that the
-    target type is correct.
+    Declare that a `handle` or ``PyObject *`` is a certain type and borrow the
+    reference. The target type ``T`` must be `object` or one of its derived
+    classes. The function doesn't do any conversions or checks. It's up to the
+    user to make sure that the target type is correct.
 
     .. code-block:: cpp
 
         PyObject *p = PyList_GetItem(obj, index);
         py::object o = reinterpret_borrow<py::object>(p);
         // or
-        py::tuple t = reinterpret_borrow<py::tuple>(p); // <-- `p` must be already be a `tuple`
-\endrst */
-template <typename T> T reinterpret_borrow(handle h) { return {h, object::borrowed_t{}}; }
+        // `p` must already be a `tuple`:
+        py::tuple t = reinterpret_borrow<py::tuple>(p); \endrst */
+template <typename T> T reinterpret_borrow(handle h) {
+  return {h, object::borrowed_t{}};
+}
 
 /** \rst
     Like `reinterpret_borrow`, but steals the reference.
@@ -307,51 +349,59 @@ template <typename T> T reinterpret_borrow(handle h) { return {h, object::borrow
      .. code-block:: cpp
 
         PyObject *p = PyObject_Str(obj);
-        py::str s = reinterpret_steal<py::str>(p); // <-- `p` must be already be a `str`
-\endrst */
-template <typename T> T reinterpret_steal(handle h) { return {h, object::stolen_t{}}; }
+        // `p` must already be a `str`:
+        py::str s = reinterpret_steal<py::str>(p); \endrst */
+template <typename T> T reinterpret_steal(handle h) {
+  return {h, object::stolen_t{}};
+}
 
 NAMESPACE_BEGIN(detail)
 inline std::string error_string();
 NAMESPACE_END(detail)
 
-/// Fetch and hold an error which was already set in Python.  An instance of this is typically
-/// thrown to propagate python-side errors back through C++ which can either be caught manually or
-/// else falls back to the function dispatcher (which then raises the captured error back to
-/// python).
+/// Fetch and hold an error which was already set in Python.  An instance of
+/// this is typically thrown to propagate python-side errors back through C++
+/// which can either be caught manually or else falls back to the function
+/// dispatcher (which then raises the captured error back to python).
 class error_already_set : public std::runtime_error {
 public:
-    /// Constructs a new exception from the current Python error indicator, if any.  The current
-    /// Python error indicator will be cleared.
-    error_already_set() : std::runtime_error(detail::error_string()) {
-        PyErr_Fetch(&m_type.ptr(), &m_value.ptr(), &m_trace.ptr());
-    }
+  /// Constructs a new exception from the current Python error indicator, if
+  /// any.  The current Python error indicator will be cleared.
+  error_already_set() : std::runtime_error(detail::error_string()) {
+    PyErr_Fetch(&m_type.ptr(), &m_value.ptr(), &m_trace.ptr());
+  }
 
-    error_already_set(const error_already_set &) = default;
-    error_already_set(error_already_set &&) = default;
+  error_already_set(const error_already_set &) = default;
+  error_already_set(error_already_set &&) = default;
 
-    inline ~error_already_set();
+  inline ~error_already_set();
 
-    /// Give the currently-held error back to Python, if any.  If there is currently a Python error
-    /// already set it is cleared first.  After this call, the current object no longer stores the
-    /// error variables (but the `.what()` string is still available).
-    void restore() { PyErr_Restore(m_type.release().ptr(), m_value.release().ptr(), m_trace.release().ptr()); }
+  /// Give the currently-held error back to Python, if any.  If there is
+  /// currently a Python error already set it is cleared first.  After this
+  /// call, the current object no longer stores the error variables (but the
+  /// `.what()` string is still available).
+  void restore() {
+    PyErr_Restore(m_type.release().ptr(), m_value.release().ptr(),
+                  m_trace.release().ptr());
+  }
 
-    // Does nothing; provided for backwards compatibility.
-    PYBIND11_DEPRECATED("Use of error_already_set.clear() is deprecated")
-    void clear() {}
+  // Does nothing; provided for backwards compatibility.
+  PYBIND11_DEPRECATED("Use of error_already_set.clear() is deprecated")
+  void clear() {}
 
-    /// Check if the currently trapped error type matches the given Python exception class (or a
-    /// subclass thereof).  May also be passed a tuple to search for any exception class matches in
-    /// the given tuple.
-    bool matches(handle exc) const { return PyErr_GivenExceptionMatches(m_type.ptr(), exc.ptr()); }
+  /// Check if the currently trapped error type matches the given Python
+  /// exception class (or a subclass thereof).  May also be passed a tuple to
+  /// search for any exception class matches in the given tuple.
+  bool matches(handle exc) const {
+    return PyErr_GivenExceptionMatches(m_type.ptr(), exc.ptr());
+  }
 
-    const object& type() const { return m_type; }
-    const object& value() const { return m_value; }
-    const object& trace() const { return m_trace; }
+  const object &type() const { return m_type; }
+  const object &value() const { return m_value; }
+  const object &trace() const { return m_trace; }
 
 private:
-    object m_type, m_value, m_trace;
+  object m_type, m_value, m_trace;
 };
 
 /** \defgroup python_builtins _
@@ -361,378 +411,481 @@ private:
 
 /** \ingroup python_builtins
     \rst
-    Return true if ``obj`` is an instance of ``T``. Type ``T`` must be a subclass of
-    `object` or a class which was exposed to Python as ``py::class_<T>``.
-\endrst */
-template <typename T, detail::enable_if_t<std::is_base_of<object, T>::value, int> = 0>
-bool isinstance(handle obj) { return T::check_(obj); }
+    Return true if ``obj`` is an instance of ``T``. Type ``T`` must be a
+    subclass of `object` or a class which was exposed to Python as
+    ``py::class_<T>``. \endrst */
+template <typename T,
+          detail::enable_if_t<std::is_base_of<object, T>::value, int> = 0>
+bool isinstance(handle obj) {
+  return T::check_(obj);
+}
 
-template <typename T, detail::enable_if_t<!std::is_base_of<object, T>::value, int> = 0>
-bool isinstance(handle obj) { return detail::isinstance_generic(obj, typeid(T)); }
+template <typename T,
+          detail::enable_if_t<!std::is_base_of<object, T>::value, int> = 0>
+bool isinstance(handle obj) {
+  return detail::isinstance_generic(obj, typeid(T));
+}
 
 template <> inline bool isinstance<handle>(handle obj) = delete;
-template <> inline bool isinstance<object>(handle obj) { return obj.ptr() != nullptr; }
+template <> inline bool isinstance<object>(handle obj) {
+  return obj.ptr() != nullptr;
+}
 
 /// \ingroup python_builtins
 /// Return true if ``obj`` is an instance of the ``type``.
 inline bool isinstance(handle obj, handle type) {
-    const auto result = PyObject_IsInstance(obj.ptr(), type.ptr());
-    if (result == -1)
-        throw error_already_set();
-    return result != 0;
+  const auto result = PyObject_IsInstance(obj.ptr(), type.ptr());
+  if (result == -1)
+    throw error_already_set();
+  return result != 0;
 }
 
 /// \addtogroup python_builtins
 /// @{
 inline bool hasattr(handle obj, handle name) {
-    return PyObject_HasAttr(obj.ptr(), name.ptr()) == 1;
+  return PyObject_HasAttr(obj.ptr(), name.ptr()) == 1;
 }
 
 inline bool hasattr(handle obj, const char *name) {
-    return PyObject_HasAttrString(obj.ptr(), name) == 1;
+  return PyObject_HasAttrString(obj.ptr(), name) == 1;
 }
 
 inline void delattr(handle obj, handle name) {
-    if (PyObject_DelAttr(obj.ptr(), name.ptr()) != 0) { throw error_already_set(); }
+  if (PyObject_DelAttr(obj.ptr(), name.ptr()) != 0) {
+    throw error_already_set();
+  }
 }
 
 inline void delattr(handle obj, const char *name) {
-    if (PyObject_DelAttrString(obj.ptr(), name) != 0) { throw error_already_set(); }
+  if (PyObject_DelAttrString(obj.ptr(), name) != 0) {
+    throw error_already_set();
+  }
 }
 
 inline object getattr(handle obj, handle name) {
-    PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr());
-    if (!result) { throw error_already_set(); }
-    return reinterpret_steal<object>(result);
+  PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr());
+  if (!result) {
+    throw error_already_set();
+  }
+  return reinterpret_steal<object>(result);
 }
 
 inline object getattr(handle obj, const char *name) {
-    PyObject *result = PyObject_GetAttrString(obj.ptr(), name);
-    if (!result) { throw error_already_set(); }
-    return reinterpret_steal<object>(result);
+  PyObject *result = PyObject_GetAttrString(obj.ptr(), name);
+  if (!result) {
+    throw error_already_set();
+  }
+  return reinterpret_steal<object>(result);
 }
 
 inline object getattr(handle obj, handle name, handle default_) {
-    if (PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr())) {
-        return reinterpret_steal<object>(result);
-    } else {
-        PyErr_Clear();
-        return reinterpret_borrow<object>(default_);
-    }
+  if (PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr())) {
+    return reinterpret_steal<object>(result);
+  } else {
+    PyErr_Clear();
+    return reinterpret_borrow<object>(default_);
+  }
 }
 
 inline object getattr(handle obj, const char *name, handle default_) {
-    if (PyObject *result = PyObject_GetAttrString(obj.ptr(), name)) {
-        return reinterpret_steal<object>(result);
-    } else {
-        PyErr_Clear();
-        return reinterpret_borrow<object>(default_);
-    }
+  if (PyObject *result = PyObject_GetAttrString(obj.ptr(), name)) {
+    return reinterpret_steal<object>(result);
+  } else {
+    PyErr_Clear();
+    return reinterpret_borrow<object>(default_);
+  }
 }
 
 inline void setattr(handle obj, handle name, handle value) {
-    if (PyObject_SetAttr(obj.ptr(), name.ptr(), value.ptr()) != 0) { throw error_already_set(); }
+  if (PyObject_SetAttr(obj.ptr(), name.ptr(), value.ptr()) != 0) {
+    throw error_already_set();
+  }
 }
 
 inline void setattr(handle obj, const char *name, handle value) {
-    if (PyObject_SetAttrString(obj.ptr(), name, value.ptr()) != 0) { throw error_already_set(); }
+  if (PyObject_SetAttrString(obj.ptr(), name, value.ptr()) != 0) {
+    throw error_already_set();
+  }
 }
 
 inline ssize_t hash(handle obj) {
-    auto h = PyObject_Hash(obj.ptr());
-    if (h == -1) { throw error_already_set(); }
-    return h;
+  auto h = PyObject_Hash(obj.ptr());
+  if (h == -1) {
+    throw error_already_set();
+  }
+  return h;
 }
 
 /// @} python_builtins
 
 NAMESPACE_BEGIN(detail)
 inline handle get_function(handle value) {
-    if (value) {
+  if (value) {
 #if PY_MAJOR_VERSION >= 3
-        if (PyInstanceMethod_Check(value.ptr()))
-            value = PyInstanceMethod_GET_FUNCTION(value.ptr());
-        else
+    if (PyInstanceMethod_Check(value.ptr()))
+      value = PyInstanceMethod_GET_FUNCTION(value.ptr());
+    else
 #endif
         if (PyMethod_Check(value.ptr()))
-            value = PyMethod_GET_FUNCTION(value.ptr());
-    }
-    return value;
+      value = PyMethod_GET_FUNCTION(value.ptr());
+  }
+  return value;
 }
 
-// Helper aliases/functions to support implicit casting of values given to python accessors/methods.
-// When given a pyobject, this simply returns the pyobject as-is; for other C++ type, the value goes
-// through pybind11::cast(obj) to convert it to an `object`.
+// Helper aliases/functions to support implicit casting of values given to
+// python accessors/methods. When given a pyobject, this simply returns the
+// pyobject as-is; for any other C++ type, the value goes through
+// pybind11::cast(obj) to convert it to an `object`.
 template <typename T, enable_if_t<is_pyobject<T>::value, int> = 0>
-auto object_or_cast(T &&o) -> decltype(std::forward<T>(o)) { return std::forward<T>(o); }
+auto object_or_cast(T &&o) -> decltype(std::forward<T>(o)) {
+  return std::forward<T>(o);
+}
 // The following casting version is implemented in cast.h:
 template <typename T, enable_if_t<!is_pyobject<T>::value, int> = 0>
 object object_or_cast(T &&o);
-// Match a PyObject*, which we want to convert directly to handle via its converting constructor
+// Match a PyObject*, which we want to convert directly to handle via its
+// converting constructor
 inline handle object_or_cast(PyObject *ptr) { return ptr; }
 
 template <typename Policy>
 class accessor : public object_api<accessor<Policy>> {
-    using key_type = typename Policy::key_type;
+  using key_type = typename Policy::key_type;
 
 public:
-    accessor(handle obj, key_type key) : obj(obj), key(std::move(key)) { }
-    accessor(const accessor &) = default;
-    accessor(accessor &&) = default;
+  accessor(handle obj, key_type key) : obj(obj), key(std::move(key)) {}
+  accessor(const accessor &) = default;
+  accessor(accessor &&) = default;
 
-    // accessor overload required to override default assignment operator (templates are not allowed
-    // to replace default compiler-generated assignments).
-    void operator=(const accessor &a) && { std::move(*this).operator=(handle(a)); }
-    void operator=(const accessor &a) & { operator=(handle(a)); }
+  // accessor overload required to override default assignment operator
+  // (templates are not allowed to replace default compiler-generated
+  // assignments).
+  void operator=(const accessor &a) && {
+    std::move(*this).operator=(handle(a));
+  }
+  void operator=(const accessor &a) & { operator=(handle(a)); }
 
-    template <typename T> void operator=(T &&value) && {
-        Policy::set(obj, key, object_or_cast(std::forward<T>(value)));
-    }
-    template <typename T> void operator=(T &&value) & {
-        get_cache() = reinterpret_borrow<object>(object_or_cast(std::forward<T>(value)));
-    }
+  template <typename T> void operator=(T &&value) && {
+    Policy::set(obj, key, object_or_cast(std::forward<T>(value)));
+  }
+  template <typename T> void operator=(T &&value) & {
+    get_cache() =
+        reinterpret_borrow<object>(object_or_cast(std::forward<T>(value)));
+  }
 
-    template <typename T = Policy>
-    PYBIND11_DEPRECATED("Use of obj.attr(...) as bool is deprecated in favor of pybind11::hasattr(obj, ...)")
-    explicit operator enable_if_t<std::is_same<T, accessor_policies::str_attr>::value ||
-            std::is_same<T, accessor_policies::obj_attr>::value, bool>() const {
-        return hasattr(obj, key);
-    }
-    template <typename T = Policy>
-    PYBIND11_DEPRECATED("Use of obj[key] as bool is deprecated in favor of obj.contains(key)")
-    explicit operator enable_if_t<std::is_same<T, accessor_policies::generic_item>::value, bool>() const {
-        return obj.contains(key);
-    }
+  template <typename T = Policy>
+  PYBIND11_DEPRECATED("Use of obj.attr(...) as bool is deprecated in favor of "
+                      "pybind11::hasattr(obj, ...)")
+  explicit
+  operator enable_if_t<std::is_same<T, accessor_policies::str_attr>::value ||
+                           std::is_same<T, accessor_policies::obj_attr>::value,
+                       bool>() const {
+    return hasattr(obj, key);
+  }
+  template <typename T = Policy>
+  PYBIND11_DEPRECATED(
+      "Use of obj[key] as bool is deprecated in favor of obj.contains(key)")
+  explicit
+  operator enable_if_t<std::is_same<T, accessor_policies::generic_item>::value,
+                       bool>() const {
+    return obj.contains(key);
+  }
 
-    operator object() const { return get_cache(); }
-    PyObject *ptr() const { return get_cache().ptr(); }
-    template <typename T> T cast() const { return get_cache().template cast<T>(); }
+  operator object() const { return get_cache(); }
+  PyObject *ptr() const { return get_cache().ptr(); }
+  template <typename T> T cast() const {
+    return get_cache().template cast<T>();
+  }
 
 private:
-    object &get_cache() const {
-        if (!cache) { cache = Policy::get(obj, key); }
-        return cache;
+  object &get_cache() const {
+    if (!cache) {
+      cache = Policy::get(obj, key);
     }
+    return cache;
+  }
 
 private:
-    handle obj;
-    key_type key;
-    mutable object cache;
+  handle obj;
+  key_type key;
+  mutable object cache;
 };
 
 NAMESPACE_BEGIN(accessor_policies)
 struct obj_attr {
-    using key_type = object;
-    static object get(handle obj, handle key) { return getattr(obj, key); }
-    static void set(handle obj, handle key, handle val) { setattr(obj, key, val); }
+  using key_type = object;
+  static object get(handle obj, handle key) { return getattr(obj, key); }
+  static void set(handle obj, handle key, handle val) {
+    setattr(obj, key, val);
+  }
 };
 
 struct str_attr {
-    using key_type = const char *;
-    static object get(handle obj, const char *key) { return getattr(obj, key); }
-    static void set(handle obj, const char *key, handle val) { setattr(obj, key, val); }
+  using key_type = const char *;
+  static object get(handle obj, const char *key) { return getattr(obj, key); }
+  static void set(handle obj, const char *key, handle val) {
+    setattr(obj, key, val);
+  }
 };
 
 struct generic_item {
-    using key_type = object;
+  using key_type = object;
 
-    static object get(handle obj, handle key) {
-        PyObject *result = PyObject_GetItem(obj.ptr(), key.ptr());
-        if (!result) { throw error_already_set(); }
-        return reinterpret_steal<object>(result);
+  static object get(handle obj, handle key) {
+    PyObject *result = PyObject_GetItem(obj.ptr(), key.ptr());
+    if (!result) {
+      throw error_already_set();
     }
+    return reinterpret_steal<object>(result);
+  }
 
-    static void set(handle obj, handle key, handle val) {
-        if (PyObject_SetItem(obj.ptr(), key.ptr(), val.ptr()) != 0) { throw error_already_set(); }
+  static void set(handle obj, handle key, handle val) {
+    if (PyObject_SetItem(obj.ptr(), key.ptr(), val.ptr()) != 0) {
+      throw error_already_set();
     }
+  }
 };
 
 struct sequence_item {
-    using key_type = size_t;
+  using key_type = size_t;
 
-    static object get(handle obj, size_t index) {
-        PyObject *result = PySequence_GetItem(obj.ptr(), static_cast<ssize_t>(index));
-        if (!result) { throw error_already_set(); }
-        return reinterpret_steal<object>(result);
+  static object get(handle obj, size_t index) {
+    PyObject *result =
+        PySequence_GetItem(obj.ptr(), static_cast<ssize_t>(index));
+    if (!result) {
+      throw error_already_set();
     }
+    return reinterpret_steal<object>(result);
+  }
 
-    static void set(handle obj, size_t index, handle val) {
-        // PySequence_SetItem does not steal a reference to 'val'
-        if (PySequence_SetItem(obj.ptr(), static_cast<ssize_t>(index), val.ptr()) != 0) {
-            throw error_already_set();
-        }
+  static void set(handle obj, size_t index, handle val) {
+    // PySequence_SetItem does not steal a reference to 'val'
+    if (PySequence_SetItem(obj.ptr(), static_cast<ssize_t>(index), val.ptr()) !=
+        0) {
+      throw error_already_set();
     }
+  }
 };
 
 struct list_item {
-    using key_type = size_t;
+  using key_type = size_t;
 
-    static object get(handle obj, size_t index) {
-        PyObject *result = PyList_GetItem(obj.ptr(), static_cast<ssize_t>(index));
-        if (!result) { throw error_already_set(); }
-        return reinterpret_borrow<object>(result);
+  static object get(handle obj, size_t index) {
+    PyObject *result = PyList_GetItem(obj.ptr(), static_cast<ssize_t>(index));
+    if (!result) {
+      throw error_already_set();
     }
+    return reinterpret_borrow<object>(result);
+  }
 
-    static void set(handle obj, size_t index, handle val) {
-        // PyList_SetItem steals a reference to 'val'
-        if (PyList_SetItem(obj.ptr(), static_cast<ssize_t>(index), val.inc_ref().ptr()) != 0) {
-            throw error_already_set();
-        }
+  static void set(handle obj, size_t index, handle val) {
+    // PyList_SetItem steals a reference to 'val'
+    if (PyList_SetItem(obj.ptr(), static_cast<ssize_t>(index),
+                       val.inc_ref().ptr()) != 0) {
+      throw error_already_set();
     }
+  }
 };
 
 struct tuple_item {
-    using key_type = size_t;
+  using key_type = size_t;
 
-    static object get(handle obj, size_t index) {
-        PyObject *result = PyTuple_GetItem(obj.ptr(), static_cast<ssize_t>(index));
-        if (!result) { throw error_already_set(); }
-        return reinterpret_borrow<object>(result);
+  static object get(handle obj, size_t index) {
+    PyObject *result = PyTuple_GetItem(obj.ptr(), static_cast<ssize_t>(index));
+    if (!result) {
+      throw error_already_set();
     }
+    return reinterpret_borrow<object>(result);
+  }
 
-    static void set(handle obj, size_t index, handle val) {
-        // PyTuple_SetItem steals a reference to 'val'
-        if (PyTuple_SetItem(obj.ptr(), static_cast<ssize_t>(index), val.inc_ref().ptr()) != 0) {
-            throw error_already_set();
-        }
+  static void set(handle obj, size_t index, handle val) {
+    // PyTuple_SetItem steals a reference to 'val'
+    if (PyTuple_SetItem(obj.ptr(), static_cast<ssize_t>(index),
+                        val.inc_ref().ptr()) != 0) {
+      throw error_already_set();
     }
+  }
 };
 NAMESPACE_END(accessor_policies)
 
 /// STL iterator template used for tuple, list, sequence and dict
-template <typename Policy>
-class generic_iterator : public Policy {
-    using It = generic_iterator;
+template <typename Policy> class generic_iterator : public Policy {
+  using It = generic_iterator;
 
 public:
-    using difference_type = ssize_t;
-    using iterator_category = typename Policy::iterator_category;
-    using value_type = typename Policy::value_type;
-    using reference = typename Policy::reference;
-    using pointer = typename Policy::pointer;
+  using difference_type = ssize_t;
+  using iterator_category = typename Policy::iterator_category;
+  using value_type = typename Policy::value_type;
+  using reference = typename Policy::reference;
+  using pointer = typename Policy::pointer;
 
-    generic_iterator() = default;
-    generic_iterator(handle seq, ssize_t index) : Policy(seq, index) { }
+  generic_iterator() = default;
+  generic_iterator(handle seq, ssize_t index) : Policy(seq, index) {}
 
-    reference operator*() const { return Policy::dereference(); }
-    reference operator[](difference_type n) const { return *(*this + n); }
-    pointer operator->() const { return **this; }
+  reference operator*() const { return Policy::dereference(); }
+  reference operator[](difference_type n) const { return *(*this + n); }
+  pointer operator->() const { return **this; }
 
-    It &operator++() { Policy::increment(); return *this; }
-    It operator++(int) { auto copy = *this; Policy::increment(); return copy; }
-    It &operator--() { Policy::decrement(); return *this; }
-    It operator--(int) { auto copy = *this; Policy::decrement(); return copy; }
-    It &operator+=(difference_type n) { Policy::advance(n); return *this; }
-    It &operator-=(difference_type n) { Policy::advance(-n); return *this; }
+  It &operator++() {
+    Policy::increment();
+    return *this;
+  }
+  It operator++(int) {
+    auto copy = *this;
+    Policy::increment();
+    return copy;
+  }
+  It &operator--() {
+    Policy::decrement();
+    return *this;
+  }
+  It operator--(int) {
+    auto copy = *this;
+    Policy::decrement();
+    return copy;
+  }
+  It &operator+=(difference_type n) {
+    Policy::advance(n);
+    return *this;
+  }
+  It &operator-=(difference_type n) {
+    Policy::advance(-n);
+    return *this;
+  }
 
-    friend It operator+(const It &a, difference_type n) { auto copy = a; return copy += n; }
-    friend It operator+(difference_type n, const It &b) { return b + n; }
-    friend It operator-(const It &a, difference_type n) { auto copy = a; return copy -= n; }
-    friend difference_type operator-(const It &a, const It &b) { return a.distance_to(b); }
+  friend It operator+(const It &a, difference_type n) {
+    auto copy = a;
+    return copy += n;
+  }
+  friend It operator+(difference_type n, const It &b) { return b + n; }
+  friend It operator-(const It &a, difference_type n) {
+    auto copy = a;
+    return copy -= n;
+  }
+  friend difference_type operator-(const It &a, const It &b) {
+    return a.distance_to(b);
+  }
 
-    friend bool operator==(const It &a, const It &b) { return a.equal(b); }
-    friend bool operator!=(const It &a, const It &b) { return !(a == b); }
-    friend bool operator< (const It &a, const It &b) { return b - a > 0; }
-    friend bool operator> (const It &a, const It &b) { return b < a; }
-    friend bool operator>=(const It &a, const It &b) { return !(a < b); }
-    friend bool operator<=(const It &a, const It &b) { return !(a > b); }
+  friend bool operator==(const It &a, const It &b) { return a.equal(b); }
+  friend bool operator!=(const It &a, const It &b) { return !(a == b); }
+  friend bool operator<(const It &a, const It &b) { return b - a > 0; }
+  friend bool operator>(const It &a, const It &b) { return b < a; }
+  friend bool operator>=(const It &a, const It &b) { return !(a < b); }
+  friend bool operator<=(const It &a, const It &b) { return !(a > b); }
 };
 
 NAMESPACE_BEGIN(iterator_policies)
-/// Quick proxy class needed to implement ``operator->`` for iterators which can't return pointers
-template <typename T>
-struct arrow_proxy {
-    T value;
+/// Quick proxy class needed to implement ``operator->`` for iterators which
+/// can't return pointers
+template <typename T> struct arrow_proxy {
+  T value;
 
-    arrow_proxy(T &&value) : value(std::move(value)) { }
-    T *operator->() const { return &value; }
+  arrow_proxy(T &&value) : value(std::move(value)) {}
+  T *operator->() const { return &value; }
 };
 
-/// Lightweight iterator policy using just a simple pointer: see ``PySequence_Fast_ITEMS``
+/// Lightweight iterator policy using just a simple pointer: see
+/// ``PySequence_Fast_ITEMS``
 class sequence_fast_readonly {
 protected:
-    using iterator_category = std::random_access_iterator_tag;
-    using value_type = handle;
-    using reference = const handle;
-    using pointer = arrow_proxy<const handle>;
+  using iterator_category = std::random_access_iterator_tag;
+  using value_type = handle;
+  using reference = const handle;
+  using pointer = arrow_proxy<const handle>;
 
-    sequence_fast_readonly(handle obj, ssize_t n) : ptr(PySequence_Fast_ITEMS(obj.ptr()) + n) { }
+  sequence_fast_readonly(handle obj, ssize_t n)
+      : ptr(PySequence_Fast_ITEMS(obj.ptr()) + n) {}
 
-    reference dereference() const { return *ptr; }
-    void increment() { ++ptr; }
-    void decrement() { --ptr; }
-    void advance(ssize_t n) { ptr += n; }
-    bool equal(const sequence_fast_readonly &b) const { return ptr == b.ptr; }
-    ssize_t distance_to(const sequence_fast_readonly &b) const { return ptr - b.ptr; }
+  reference dereference() const { return *ptr; }
+  void increment() { ++ptr; }
+  void decrement() { --ptr; }
+  void advance(ssize_t n) { ptr += n; }
+  bool equal(const sequence_fast_readonly &b) const { return ptr == b.ptr; }
+  ssize_t distance_to(const sequence_fast_readonly &b) const {
+    return ptr - b.ptr;
+  }
 
 private:
-    PyObject **ptr;
+  PyObject **ptr;
 };
 
-/// Full read and write access using the sequence protocol: see ``detail::sequence_accessor``
+/// Full read and write access using the sequence protocol: see
+/// ``detail::sequence_accessor``
 class sequence_slow_readwrite {
 protected:
-    using iterator_category = std::random_access_iterator_tag;
-    using value_type = object;
-    using reference = sequence_accessor;
-    using pointer = arrow_proxy<const sequence_accessor>;
+  using iterator_category = std::random_access_iterator_tag;
+  using value_type = object;
+  using reference = sequence_accessor;
+  using pointer = arrow_proxy<const sequence_accessor>;
 
-    sequence_slow_readwrite(handle obj, ssize_t index) : obj(obj), index(index) { }
+  sequence_slow_readwrite(handle obj, ssize_t index) : obj(obj), index(index) {}
 
-    reference dereference() const { return {obj, static_cast<size_t>(index)}; }
-    void increment() { ++index; }
-    void decrement() { --index; }
-    void advance(ssize_t n) { index += n; }
-    bool equal(const sequence_slow_readwrite &b) const { return index == b.index; }
-    ssize_t distance_to(const sequence_slow_readwrite &b) const { return index - b.index; }
+  reference dereference() const { return {obj, static_cast<size_t>(index)}; }
+  void increment() { ++index; }
+  void decrement() { --index; }
+  void advance(ssize_t n) { index += n; }
+  bool equal(const sequence_slow_readwrite &b) const {
+    return index == b.index;
+  }
+  ssize_t distance_to(const sequence_slow_readwrite &b) const {
+    return index - b.index;
+  }
 
 private:
-    handle obj;
-    ssize_t index;
+  handle obj;
+  ssize_t index;
 };
 
 /// Python's dictionary protocol permits this to be a forward iterator
 class dict_readonly {
 protected:
-    using iterator_category = std::forward_iterator_tag;
-    using value_type = std::pair<handle, handle>;
-    using reference = const value_type;
-    using pointer = arrow_proxy<const value_type>;
+  using iterator_category = std::forward_iterator_tag;
+  using value_type = std::pair<handle, handle>;
+  using reference = const value_type;
+  using pointer = arrow_proxy<const value_type>;
 
-    dict_readonly() = default;
-    dict_readonly(handle obj, ssize_t pos) : obj(obj), pos(pos) { increment(); }
+  dict_readonly() = default;
+  dict_readonly(handle obj, ssize_t pos) : obj(obj), pos(pos) { increment(); }
 
-    reference dereference() const { return {key, value}; }
-    void increment() { if (!PyDict_Next(obj.ptr(), &pos, &key, &value)) { pos = -1; } }
-    bool equal(const dict_readonly &b) const { return pos == b.pos; }
+  reference dereference() const { return {key, value}; }
+  void increment() {
+    if (!PyDict_Next(obj.ptr(), &pos, &key, &value)) {
+      pos = -1;
+    }
+  }
+  bool equal(const dict_readonly &b) const { return pos == b.pos; }
 
 private:
-    handle obj;
-    PyObject *key = nullptr, *value = nullptr;
-    ssize_t pos = -1;
+  handle obj;
+  PyObject *key = nullptr, *value = nullptr;
+  ssize_t pos = -1;
 };
 NAMESPACE_END(iterator_policies)
 
 #if !defined(PYPY_VERSION)
-using tuple_iterator = generic_iterator<iterator_policies::sequence_fast_readonly>;
-using list_iterator = generic_iterator<iterator_policies::sequence_fast_readonly>;
+using tuple_iterator =
+    generic_iterator<iterator_policies::sequence_fast_readonly>;
+using list_iterator =
+    generic_iterator<iterator_policies::sequence_fast_readonly>;
 #else
-using tuple_iterator = generic_iterator<iterator_policies::sequence_slow_readwrite>;
-using list_iterator = generic_iterator<iterator_policies::sequence_slow_readwrite>;
+using tuple_iterator =
+    generic_iterator<iterator_policies::sequence_slow_readwrite>;
+using list_iterator =
+    generic_iterator<iterator_policies::sequence_slow_readwrite>;
 #endif
 
-using sequence_iterator = generic_iterator<iterator_policies::sequence_slow_readwrite>;
+using sequence_iterator =
+    generic_iterator<iterator_policies::sequence_slow_readwrite>;
 using dict_iterator = generic_iterator<iterator_policies::dict_readonly>;
 
 inline bool PyIterable_Check(PyObject *obj) {
-    PyObject *iter = PyObject_GetIter(obj);
-    if (iter) {
-        Py_DECREF(iter);
-        return true;
-    } else {
-        PyErr_Clear();
-        return false;
-    }
+  PyObject *iter = PyObject_GetIter(obj);
+  if (iter) {
+    Py_DECREF(iter);
+    return true;
+  } else {
+    PyErr_Clear();
+    return false;
+  }
 }
 
 inline bool PyNone_Check(PyObject *o) { return o == Py_None; }
@@ -740,29 +893,36 @@ inline bool PyNone_Check(PyObject *o) { return o == Py_None; }
 inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; }
 #endif
 
-inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); }
+inline bool PyUnicode_Check_Permissive(PyObject *o) {
+  return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o);
+}
 
-inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; }
+inline bool PyStaticMethod_Check(PyObject *o) {
+  return o->ob_type == &PyStaticMethod_Type;
+}
 
 class kwargs_proxy : public handle {
 public:
-    explicit kwargs_proxy(handle h) : handle(h) { }
+  explicit kwargs_proxy(handle h) : handle(h) {}
 };
 
 class args_proxy : public handle {
 public:
-    explicit args_proxy(handle h) : handle(h) { }
-    kwargs_proxy operator*() const { return kwargs_proxy(*this); }
+  explicit args_proxy(handle h) : handle(h) {}
+  kwargs_proxy operator*() const { return kwargs_proxy(*this); }
 };
 
 /// Python argument categories (using PEP 448 terms)
 template <typename T> using is_keyword = std::is_base_of<arg, T>;
-template <typename T> using is_s_unpacking = std::is_same<args_proxy, T>; // * unpacking
-template <typename T> using is_ds_unpacking = std::is_same<kwargs_proxy, T>; // ** unpacking
-template <typename T> using is_positional = satisfies_none_of<T,
-    is_keyword, is_s_unpacking, is_ds_unpacking
->;
-template <typename T> using is_keyword_or_ds = satisfies_any_of<T, is_keyword, is_ds_unpacking>;
+template <typename T>
+using is_s_unpacking = std::is_same<args_proxy, T>; // * unpacking
+template <typename T>
+using is_ds_unpacking = std::is_same<kwargs_proxy, T>; // ** unpacking
+template <typename T>
+using is_positional =
+    satisfies_none_of<T, is_keyword, is_s_unpacking, is_ds_unpacking>;
+template <typename T>
+using is_keyword_or_ds = satisfies_any_of<T, is_keyword, is_ds_unpacking>;
 
 // Call argument collector forward declarations
 template <return_value_policy policy = return_value_policy::automatic_reference>
@@ -772,41 +932,55 @@ class unpacking_collector;
 
 NAMESPACE_END(detail)
 
-// TODO: After the deprecated constructors are removed, this macro can be simplified by
-//       inheriting ctors: `using Parent::Parent`. It's not an option right now because
-//       the `using` statement triggers the parent deprecation warning even if the ctor
-//       isn't even used.
-#define PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
-    public: \
-        PYBIND11_DEPRECATED("Use reinterpret_borrow<"#Name">() or reinterpret_steal<"#Name">()") \
-        Name(handle h, bool is_borrowed) : Parent(is_borrowed ? Parent(h, borrowed_t{}) : Parent(h, stolen_t{})) { } \
-        Name(handle h, borrowed_t) : Parent(h, borrowed_t{}) { } \
-        Name(handle h, stolen_t) : Parent(h, stolen_t{}) { } \
-        PYBIND11_DEPRECATED("Use py::isinstance<py::python_type>(obj) instead") \
-        bool check() const { return m_ptr != nullptr && (bool) CheckFun(m_ptr); } \
-        static bool check_(handle h) { return h.ptr() != nullptr && CheckFun(h.ptr()); }
+// TODO: After the deprecated constructors are removed, this macro can be
+//       simplified by inheriting ctors: `using Parent::Parent`.
+//       It's not an option right now because the `using` statement
+//       triggers the parent deprecation warning even if the ctor
+//       isn't even used.
+#define PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun)                         \
+public:                                                                        \
+  PYBIND11_DEPRECATED("Use reinterpret_borrow<" #Name                          \
+                      ">() or reinterpret_steal<" #Name ">()")                 \
+  Name(handle h, bool is_borrowed)                                             \
+      : Parent(is_borrowed ? Parent(h, borrowed_t{})                           \
+                           : Parent(h, stolen_t{})) {}                         \
+  Name(handle h, borrowed_t) : Parent(h, borrowed_t{}) {}                      \
+  Name(handle h, stolen_t) : Parent(h, stolen_t{}) {}                          \
+  PYBIND11_DEPRECATED("Use py::isinstance<py::python_type>(obj) instead")      \
+  bool check() const { return m_ptr != nullptr && (bool)CheckFun(m_ptr); }     \
+  static bool check_(handle h) {                                               \
+    return h.ptr() != nullptr && CheckFun(h.ptr());                            \
+  }
 
-#define PYBIND11_OBJECT_CVT(Name, Parent, CheckFun, ConvertFun) \
-    PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
-    /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \
-    Name(const object &o) \
-    : Parent(check_(o) ? o.inc_ref().ptr() : ConvertFun(o.ptr()), stolen_t{}) \
-    { if (!m_ptr) throw error_already_set(); } \
-    Name(object &&o) \
-    : Parent(check_(o) ? o.release().ptr() : ConvertFun(o.ptr()), stolen_t{}) \
-    { if (!m_ptr) throw error_already_set(); } \
-    template <typename Policy_> \
-    Name(const ::pybind11::detail::accessor<Policy_> &a) : Name(object(a)) { }
+#define PYBIND11_OBJECT_CVT(Name, Parent, CheckFun, ConvertFun)                \
+  PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun)                               \
+  /* This is deliberately not 'explicit' to allow implicit conversion from     \
+   * object: */                                                                \
+  Name(const object &o)                                                        \
+      : Parent(check_(o) ? o.inc_ref().ptr() : ConvertFun(o.ptr()),            \
+               stolen_t{}) {                                                   \
+    if (!m_ptr)                                                                \
+      throw error_already_set();                                               \
+  }                                                                            \
+  Name(object &&o)                                                             \
+      : Parent(check_(o) ? o.release().ptr() : ConvertFun(o.ptr()),            \
+               stolen_t{}) {                                                   \
+    if (!m_ptr)                                                                \
+      throw error_already_set();                                               \
+  }                                                                            \
+  template <typename Policy_>                                                  \
+  Name(const ::pybind11::detail::accessor<Policy_> &a) : Name(object(a)) {}
 
-#define PYBIND11_OBJECT(Name, Parent, CheckFun) \
-    PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
-    /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \
-    Name(const object &o) : Parent(o) { } \
-    Name(object &&o) : Parent(std::move(o)) { }
+#define PYBIND11_OBJECT(Name, Parent, CheckFun)                                \
+  PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun)                               \
+  /* This is deliberately not 'explicit' to allow implicit conversion from     \
+   * object: */                                                                \
+  Name(const object &o) : Parent(o) {}                                         \
+  Name(object &&o) : Parent(std::move(o)) {}
 
-#define PYBIND11_OBJECT_DEFAULT(Name, Parent, CheckFun) \
-    PYBIND11_OBJECT(Name, Parent, CheckFun) \
-    Name() : Parent() { }
+#define PYBIND11_OBJECT_DEFAULT(Name, Parent, CheckFun)                        \
+  PYBIND11_OBJECT(Name, Parent, CheckFun)                                      \
+  Name() : Parent() {}
 
 /// \addtogroup pytypes
 /// @{
@@ -821,125 +995,139 @@ NAMESPACE_END(detail)
 \endrst */
 class iterator : public object {
 public:
-    using iterator_category = std::input_iterator_tag;
-    using difference_type = ssize_t;
-    using value_type = handle;
-    using reference = const handle;
-    using pointer = const handle *;
+  using iterator_category = std::input_iterator_tag;
+  using difference_type = ssize_t;
+  using value_type = handle;
+  using reference = const handle;
+  using pointer = const handle *;
 
-    PYBIND11_OBJECT_DEFAULT(iterator, object, PyIter_Check)
+  PYBIND11_OBJECT_DEFAULT(iterator, object, PyIter_Check)
 
-    iterator& operator++() {
-        advance();
-        return *this;
+  iterator &operator++() {
+    advance();
+    return *this;
+  }
+
+  iterator operator++(int) {
+    auto rv = *this;
+    advance();
+    return rv;
+  }
+
+  reference operator*() const {
+    if (m_ptr && !value.ptr()) {
+      auto &self = const_cast<iterator &>(*this);
+      self.advance();
     }
+    return value;
+  }
 
-    iterator operator++(int) {
-        auto rv = *this;
-        advance();
-        return rv;
-    }
+  pointer operator->() const {
+    operator*();
+    return &value;
+  }
 
-    reference operator*() const {
-        if (m_ptr && !value.ptr()) {
-            auto& self = const_cast<iterator &>(*this);
-            self.advance();
-        }
-        return value;
-    }
+  /** \rst
+       The value marking the end of iteration: ``it == iterator::sentinel()``
+       is equivalent to catching ``StopIteration`` in Python.
 
-    pointer operator->() const { operator*(); return &value; }
+       .. code-block:: cpp
 
-    /** \rst
-         The value which marks the end of the iteration. ``it == iterator::sentinel()``
-         is equivalent to catching ``StopIteration`` in Python.
+           void foo(py::iterator it) {
+               while (it != py::iterator::sentinel()) {
+                  // use `*it`
+                  ++it;
+               }
+           }
+  \endrst */
+  static iterator sentinel() { return {}; }
 
-         .. code-block:: cpp
-
-             void foo(py::iterator it) {
-                 while (it != py::iterator::sentinel()) {
-                    // use `*it`
-                    ++it;
-                 }
-             }
-    \endrst */
-    static iterator sentinel() { return {}; }
-
-    friend bool operator==(const iterator &a, const iterator &b) { return a->ptr() == b->ptr(); }
-    friend bool operator!=(const iterator &a, const iterator &b) { return a->ptr() != b->ptr(); }
+  friend bool operator==(const iterator &a, const iterator &b) {
+    return a->ptr() == b->ptr();
+  }
+  friend bool operator!=(const iterator &a, const iterator &b) {
+    return a->ptr() != b->ptr();
+  }
 
 private:
-    void advance() {
-        value = reinterpret_steal<object>(PyIter_Next(m_ptr));
-        if (PyErr_Occurred()) { throw error_already_set(); }
+  void advance() {
+    value = reinterpret_steal<object>(PyIter_Next(m_ptr));
+    if (PyErr_Occurred()) {
+      throw error_already_set();
     }
+  }
 
 private:
-    object value = {};
+  object value = {};
 };
 
 class iterable : public object {
 public:
-    PYBIND11_OBJECT_DEFAULT(iterable, object, detail::PyIterable_Check)
+  PYBIND11_OBJECT_DEFAULT(iterable, object, detail::PyIterable_Check)
 };
 
 class bytes;
 
 class str : public object {
 public:
-    PYBIND11_OBJECT_CVT(str, object, detail::PyUnicode_Check_Permissive, raw_str)
+  PYBIND11_OBJECT_CVT(str, object, detail::PyUnicode_Check_Permissive, raw_str)
 
-    str(const char *c, size_t n)
-        : object(PyUnicode_FromStringAndSize(c, (ssize_t) n), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate string object!");
+  str(const char *c, size_t n)
+      : object(PyUnicode_FromStringAndSize(c, (ssize_t)n), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate string object!");
+  }
+
+  // 'explicit' is explicitly omitted from the following constructors to allow
+  // implicit conversion to py::str from C++ string-like objects
+  str(const char *c = "") : object(PyUnicode_FromString(c), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate string object!");
+  }
+
+  str(const std::string &s) : str(s.data(), s.size()) {}
+
+  explicit str(const bytes &b);
+
+  /** \rst
+      Return a string representation of the object. This is analogous to
+      the ``str()`` function in Python.
+  \endrst */
+  explicit str(handle h) : object(raw_str(h.ptr()), stolen_t{}) {}
+
+  operator std::string() const {
+    object temp = *this;
+    if (PyUnicode_Check(m_ptr)) {
+      temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(m_ptr));
+      if (!temp)
+        pybind11_fail("Unable to extract string contents! (encoding issue)");
     }
+    char *buffer;
+    ssize_t length;
+    if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length))
+      pybind11_fail("Unable to extract string contents! (invalid type)");
+    return std::string(buffer, (size_t)length);
+  }
 
-    // 'explicit' is explicitly omitted from the following constructors to allow implicit conversion to py::str from C++ string-like objects
-    str(const char *c = "")
-        : object(PyUnicode_FromString(c), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate string object!");
-    }
-
-    str(const std::string &s) : str(s.data(), s.size()) { }
-
-    explicit str(const bytes &b);
-
-    /** \rst
-        Return a string representation of the object. This is analogous to
-        the ``str()`` function in Python.
-    \endrst */
-    explicit str(handle h) : object(raw_str(h.ptr()), stolen_t{}) { }
-
-    operator std::string() const {
-        object temp = *this;
-        if (PyUnicode_Check(m_ptr)) {
-            temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(m_ptr));
-            if (!temp)
-                pybind11_fail("Unable to extract string contents! (encoding issue)");
-        }
-        char *buffer;
-        ssize_t length;
-        if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length))
-            pybind11_fail("Unable to extract string contents! (invalid type)");
-        return std::string(buffer, (size_t) length);
-    }
-
-    template <typename... Args>
-    str format(Args &&...args) const {
-        return attr("format")(std::forward<Args>(args)...);
-    }
+  template <typename... Args> str format(Args &&... args) const {
+    return attr("format")(std::forward<Args>(args)...);
+  }
 
 private:
-    /// Return string representation -- always returns a new reference, even if already a str
-    static PyObject *raw_str(PyObject *op) {
-        PyObject *str_value = PyObject_Str(op);
+  /// Return string representation -- always returns a new reference, even if
+  /// already a str
+  static PyObject *raw_str(PyObject *op) {
+    PyObject *str_value = PyObject_Str(op);
 #if PY_MAJOR_VERSION < 3
-        if (!str_value) throw error_already_set();
-        PyObject *unicode = PyUnicode_FromEncodedObject(str_value, "utf-8", nullptr);
-        Py_XDECREF(str_value); str_value = unicode;
+    if (!str_value)
+      throw error_already_set();
+    PyObject *unicode =
+        PyUnicode_FromEncodedObject(str_value, "utf-8", nullptr);
+    Py_XDECREF(str_value);
+    str_value = unicode;
 #endif
-        return str_value;
-    }
+    return str_value;
+  }
 };
 /// @} pytypes
 
@@ -948,520 +1136,596 @@ inline namespace literals {
     String literal version of `str`
  \endrst */
 inline str operator"" _s(const char *s, size_t size) { return {s, size}; }
-}
+} // namespace literals
 
 /// \addtogroup pytypes
 /// @{
 class bytes : public object {
 public:
-    PYBIND11_OBJECT(bytes, object, PYBIND11_BYTES_CHECK)
+  PYBIND11_OBJECT(bytes, object, PYBIND11_BYTES_CHECK)
 
-    // Allow implicit conversion:
-    bytes(const char *c = "")
-        : object(PYBIND11_BYTES_FROM_STRING(c), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate bytes object!");
-    }
+  // Allow implicit conversion:
+  bytes(const char *c = "")
+      : object(PYBIND11_BYTES_FROM_STRING(c), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate bytes object!");
+  }
 
-    bytes(const char *c, size_t n)
-        : object(PYBIND11_BYTES_FROM_STRING_AND_SIZE(c, (ssize_t) n), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate bytes object!");
-    }
+  bytes(const char *c, size_t n)
+      : object(PYBIND11_BYTES_FROM_STRING_AND_SIZE(c, (ssize_t)n), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate bytes object!");
+  }
 
-    // Allow implicit conversion:
-    bytes(const std::string &s) : bytes(s.data(), s.size()) { }
+  // Allow implicit conversion:
+  bytes(const std::string &s) : bytes(s.data(), s.size()) {}
 
-    explicit bytes(const pybind11::str &s);
+  explicit bytes(const pybind11::str &s);
 
-    operator std::string() const {
-        char *buffer;
-        ssize_t length;
-        if (PYBIND11_BYTES_AS_STRING_AND_SIZE(m_ptr, &buffer, &length))
-            pybind11_fail("Unable to extract bytes contents!");
-        return std::string(buffer, (size_t) length);
-    }
+  operator std::string() const {
+    char *buffer;
+    ssize_t length;
+    if (PYBIND11_BYTES_AS_STRING_AND_SIZE(m_ptr, &buffer, &length))
+      pybind11_fail("Unable to extract bytes contents!");
+    return std::string(buffer, (size_t)length);
+  }
 };
 
 inline bytes::bytes(const pybind11::str &s) {
-    object temp = s;
-    if (PyUnicode_Check(s.ptr())) {
-        temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(s.ptr()));
-        if (!temp)
-            pybind11_fail("Unable to extract string contents! (encoding issue)");
-    }
-    char *buffer;
-    ssize_t length;
-    if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length))
-        pybind11_fail("Unable to extract string contents! (invalid type)");
-    auto obj = reinterpret_steal<object>(PYBIND11_BYTES_FROM_STRING_AND_SIZE(buffer, length));
-    if (!obj)
-        pybind11_fail("Could not allocate bytes object!");
-    m_ptr = obj.release().ptr();
+  object temp = s;
+  if (PyUnicode_Check(s.ptr())) {
+    temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(s.ptr()));
+    if (!temp)
+      pybind11_fail("Unable to extract string contents! (encoding issue)");
+  }
+  char *buffer;
+  ssize_t length;
+  if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length))
+    pybind11_fail("Unable to extract string contents! (invalid type)");
+  auto obj = reinterpret_steal<object>(
+      PYBIND11_BYTES_FROM_STRING_AND_SIZE(buffer, length));
+  if (!obj)
+    pybind11_fail("Could not allocate bytes object!");
+  m_ptr = obj.release().ptr();
 }
 
-inline str::str(const bytes& b) {
-    char *buffer;
-    ssize_t length;
-    if (PYBIND11_BYTES_AS_STRING_AND_SIZE(b.ptr(), &buffer, &length))
-        pybind11_fail("Unable to extract bytes contents!");
-    auto obj = reinterpret_steal<object>(PyUnicode_FromStringAndSize(buffer, (ssize_t) length));
-    if (!obj)
-        pybind11_fail("Could not allocate string object!");
-    m_ptr = obj.release().ptr();
+inline str::str(const bytes &b) {
+  char *buffer;
+  ssize_t length;
+  if (PYBIND11_BYTES_AS_STRING_AND_SIZE(b.ptr(), &buffer, &length))
+    pybind11_fail("Unable to extract bytes contents!");
+  auto obj = reinterpret_steal<object>(
+      PyUnicode_FromStringAndSize(buffer, (ssize_t)length));
+  if (!obj)
+    pybind11_fail("Could not allocate string object!");
+  m_ptr = obj.release().ptr();
 }
 
 class none : public object {
 public:
-    PYBIND11_OBJECT(none, object, detail::PyNone_Check)
-    none() : object(Py_None, borrowed_t{}) { }
+  PYBIND11_OBJECT(none, object, detail::PyNone_Check)
+  none() : object(Py_None, borrowed_t{}) {}
 };
 
 #if PY_MAJOR_VERSION >= 3
 class ellipsis : public object {
 public:
-    PYBIND11_OBJECT(ellipsis, object, detail::PyEllipsis_Check)
-    ellipsis() : object(Py_Ellipsis, borrowed_t{}) { }
+  PYBIND11_OBJECT(ellipsis, object, detail::PyEllipsis_Check)
+  ellipsis() : object(Py_Ellipsis, borrowed_t{}) {}
 };
 #endif
 
 class bool_ : public object {
 public:
-    PYBIND11_OBJECT_CVT(bool_, object, PyBool_Check, raw_bool)
-    bool_() : object(Py_False, borrowed_t{}) { }
-    // Allow implicit conversion from and to `bool`:
-    bool_(bool value) : object(value ? Py_True : Py_False, borrowed_t{}) { }
-    operator bool() const { return m_ptr && PyLong_AsLong(m_ptr) != 0; }
+  PYBIND11_OBJECT_CVT(bool_, object, PyBool_Check, raw_bool)
+  bool_() : object(Py_False, borrowed_t{}) {}
+  // Allow implicit conversion from and to `bool`:
+  bool_(bool value) : object(value ? Py_True : Py_False, borrowed_t{}) {}
+  operator bool() const { return m_ptr && PyLong_AsLong(m_ptr) != 0; }
 
 private:
-    /// Return the truth value of an object -- always returns a new reference
-    static PyObject *raw_bool(PyObject *op) {
-        const auto value = PyObject_IsTrue(op);
-        if (value == -1) return nullptr;
-        return handle(value ? Py_True : Py_False).inc_ref().ptr();
-    }
+  /// Return the truth value of an object -- always returns a new reference
+  static PyObject *raw_bool(PyObject *op) {
+    const auto value = PyObject_IsTrue(op);
+    if (value == -1)
+      return nullptr;
+    return handle(value ? Py_True : Py_False).inc_ref().ptr();
+  }
 };
 
 NAMESPACE_BEGIN(detail)
-// Converts a value to the given unsigned type.  If an error occurs, you get back (Unsigned) -1;
-// otherwise you get back the unsigned long or unsigned long long value cast to (Unsigned).
-// (The distinction is critically important when casting a returned -1 error value to some other
-// unsigned type: (A)-1 != (B)-1 when A and B are unsigned types of different sizes).
-template <typename Unsigned>
-Unsigned as_unsigned(PyObject *o) {
-    if (sizeof(Unsigned) <= sizeof(unsigned long)
+// Converts a value to the given unsigned type.  If an error occurs, you get
+// back (Unsigned) -1; otherwise you get back the unsigned long or unsigned long
+// long value cast to (Unsigned). (The distinction is critically important when
+// casting a returned -1 error value to some other unsigned type: (A)-1 != (B)-1
+// when A and B are unsigned types of different sizes).
+template <typename Unsigned> Unsigned as_unsigned(PyObject *o) {
+  if (sizeof(Unsigned) <= sizeof(unsigned long)
 #if PY_VERSION_HEX < 0x03000000
-            || PyInt_Check(o)
+      || PyInt_Check(o)
 #endif
-    ) {
-        unsigned long v = PyLong_AsUnsignedLong(o);
-        return v == (unsigned long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
-    }
-    else {
-        unsigned long long v = PyLong_AsUnsignedLongLong(o);
-        return v == (unsigned long long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
-    }
+  ) {
+    unsigned long v = PyLong_AsUnsignedLong(o);
+    return v == (unsigned long)-1 && PyErr_Occurred() ? (Unsigned)-1
+                                                      : (Unsigned)v;
+  } else {
+    unsigned long long v = PyLong_AsUnsignedLongLong(o);
+    return v == (unsigned long long)-1 && PyErr_Occurred() ? (Unsigned)-1
+                                                           : (Unsigned)v;
+  }
 }
 NAMESPACE_END(detail)
 
 class int_ : public object {
 public:
-    PYBIND11_OBJECT_CVT(int_, object, PYBIND11_LONG_CHECK, PyNumber_Long)
-    int_() : object(PyLong_FromLong(0), stolen_t{}) { }
-    // Allow implicit conversion from C++ integral types:
-    template <typename T,
-              detail::enable_if_t<std::is_integral<T>::value, int> = 0>
-    int_(T value) {
-        if (sizeof(T) <= sizeof(long)) {
-            if (std::is_signed<T>::value)
-                m_ptr = PyLong_FromLong((long) value);
-            else
-                m_ptr = PyLong_FromUnsignedLong((unsigned long) value);
-        } else {
-            if (std::is_signed<T>::value)
-                m_ptr = PyLong_FromLongLong((long long) value);
-            else
-                m_ptr = PyLong_FromUnsignedLongLong((unsigned long long) value);
-        }
-        if (!m_ptr) pybind11_fail("Could not allocate int object!");
+  PYBIND11_OBJECT_CVT(int_, object, PYBIND11_LONG_CHECK, PyNumber_Long)
+  int_() : object(PyLong_FromLong(0), stolen_t{}) {}
+  // Allow implicit conversion from C++ integral types:
+  template <typename T,
+            detail::enable_if_t<std::is_integral<T>::value, int> = 0>
+  int_(T value) {
+    if (sizeof(T) <= sizeof(long)) {
+      if (std::is_signed<T>::value)
+        m_ptr = PyLong_FromLong((long)value);
+      else
+        m_ptr = PyLong_FromUnsignedLong((unsigned long)value);
+    } else {
+      if (std::is_signed<T>::value)
+        m_ptr = PyLong_FromLongLong((long long)value);
+      else
+        m_ptr = PyLong_FromUnsignedLongLong((unsigned long long)value);
     }
+    if (!m_ptr)
+      pybind11_fail("Could not allocate int object!");
+  }
 
-    template <typename T,
-              detail::enable_if_t<std::is_integral<T>::value, int> = 0>
-    operator T() const {
-        return std::is_unsigned<T>::value
-            ? detail::as_unsigned<T>(m_ptr)
-            : sizeof(T) <= sizeof(long)
-              ? (T) PyLong_AsLong(m_ptr)
-              : (T) PYBIND11_LONG_AS_LONGLONG(m_ptr);
-    }
+  template <typename T,
+            detail::enable_if_t<std::is_integral<T>::value, int> = 0>
+  operator T() const {
+    return std::is_unsigned<T>::value
+               ? detail::as_unsigned<T>(m_ptr)
+               : sizeof(T) <= sizeof(long)
+                     ? (T)PyLong_AsLong(m_ptr)
+                     : (T)PYBIND11_LONG_AS_LONGLONG(m_ptr);
+  }
 };
 
 class float_ : public object {
 public:
-    PYBIND11_OBJECT_CVT(float_, object, PyFloat_Check, PyNumber_Float)
-    // Allow implicit conversion from float/double:
-    float_(float value) : object(PyFloat_FromDouble((double) value), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate float object!");
-    }
-    float_(double value = .0) : object(PyFloat_FromDouble((double) value), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate float object!");
-    }
-    operator float() const { return (float) PyFloat_AsDouble(m_ptr); }
-    operator double() const { return (double) PyFloat_AsDouble(m_ptr); }
+  PYBIND11_OBJECT_CVT(float_, object, PyFloat_Check, PyNumber_Float)
+  // Allow implicit conversion from float/double:
+  float_(float value) : object(PyFloat_FromDouble((double)value), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate float object!");
+  }
+  float_(double value = .0)
+      : object(PyFloat_FromDouble((double)value), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate float object!");
+  }
+  operator float() const { return (float)PyFloat_AsDouble(m_ptr); }
+  operator double() const { return (double)PyFloat_AsDouble(m_ptr); }
 };
 
 class weakref : public object {
 public:
-    PYBIND11_OBJECT_DEFAULT(weakref, object, PyWeakref_Check)
-    explicit weakref(handle obj, handle callback = {})
-        : object(PyWeakref_NewRef(obj.ptr(), callback.ptr()), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate weak reference!");
-    }
+  PYBIND11_OBJECT_DEFAULT(weakref, object, PyWeakref_Check)
+  explicit weakref(handle obj, handle callback = {})
+      : object(PyWeakref_NewRef(obj.ptr(), callback.ptr()), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate weak reference!");
+  }
 };
 
 class slice : public object {
 public:
-    PYBIND11_OBJECT_DEFAULT(slice, object, PySlice_Check)
-    slice(ssize_t start_, ssize_t stop_, ssize_t step_) {
-        int_ start(start_), stop(stop_), step(step_);
-        m_ptr = PySlice_New(start.ptr(), stop.ptr(), step.ptr());
-        if (!m_ptr) pybind11_fail("Could not allocate slice object!");
-    }
-    bool compute(size_t length, size_t *start, size_t *stop, size_t *step,
-                 size_t *slicelength) const {
-        return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr,
-                                    (ssize_t) length, (ssize_t *) start,
-                                    (ssize_t *) stop, (ssize_t *) step,
-                                    (ssize_t *) slicelength) == 0;
-    }
-    bool compute(ssize_t length, ssize_t *start, ssize_t *stop, ssize_t *step,
-      ssize_t *slicelength) const {
-      return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr,
-          length, start,
-          stop, step,
-          slicelength) == 0;
-    }
+  PYBIND11_OBJECT_DEFAULT(slice, object, PySlice_Check)
+  slice(ssize_t start_, ssize_t stop_, ssize_t step_) {
+    int_ start(start_), stop(stop_), step(step_);
+    m_ptr = PySlice_New(start.ptr(), stop.ptr(), step.ptr());
+    if (!m_ptr)
+      pybind11_fail("Could not allocate slice object!");
+  }
+  bool compute(size_t length, size_t *start, size_t *stop, size_t *step,
+               size_t *slicelength) const {
+    return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *)m_ptr, (ssize_t)length,
+                                (ssize_t *)start, (ssize_t *)stop,
+                                (ssize_t *)step, (ssize_t *)slicelength) == 0;
+  }
+  bool compute(ssize_t length, ssize_t *start, ssize_t *stop, ssize_t *step,
+               ssize_t *slicelength) const {
+    return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *)m_ptr, length, start,
+                                stop, step, slicelength) == 0;
+  }
 };
 
 class capsule : public object {
 public:
-    PYBIND11_OBJECT_DEFAULT(capsule, object, PyCapsule_CheckExact)
-    PYBIND11_DEPRECATED("Use reinterpret_borrow<capsule>() or reinterpret_steal<capsule>()")
-    capsule(PyObject *ptr, bool is_borrowed) : object(is_borrowed ? object(ptr, borrowed_t{}) : object(ptr, stolen_t{})) { }
+  PYBIND11_OBJECT_DEFAULT(capsule, object, PyCapsule_CheckExact)
+  PYBIND11_DEPRECATED(
+      "Use reinterpret_borrow<capsule>() or reinterpret_steal<capsule>()")
+  capsule(PyObject *ptr, bool is_borrowed)
+      : object(is_borrowed ? object(ptr, borrowed_t{})
+                           : object(ptr, stolen_t{})) {}
 
-    explicit capsule(const void *value, const char *name = nullptr, void (*destructor)(PyObject *) = nullptr)
-        : object(PyCapsule_New(const_cast<void *>(value), name, destructor), stolen_t{}) {
-        if (!m_ptr)
-            pybind11_fail("Could not allocate capsule object!");
-    }
+  explicit capsule(const void *value, const char *name = nullptr,
+                   void (*destructor)(PyObject *) = nullptr)
+      : object(PyCapsule_New(const_cast<void *>(value), name, destructor),
+               stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate capsule object!");
+  }
 
-    PYBIND11_DEPRECATED("Please pass a destructor that takes a void pointer as input")
-    capsule(const void *value, void (*destruct)(PyObject *))
-        : object(PyCapsule_New(const_cast<void*>(value), nullptr, destruct), stolen_t{}) {
-        if (!m_ptr)
-            pybind11_fail("Could not allocate capsule object!");
-    }
+  PYBIND11_DEPRECATED(
+      "Please pass a destructor that takes a void pointer as input")
+  capsule(const void *value, void (*destruct)(PyObject *))
+      : object(PyCapsule_New(const_cast<void *>(value), nullptr, destruct),
+               stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate capsule object!");
+  }
 
-    capsule(const void *value, void (*destructor)(void *)) {
-        m_ptr = PyCapsule_New(const_cast<void *>(value), nullptr, [](PyObject *o) {
-            auto destructor = reinterpret_cast<void (*)(void *)>(PyCapsule_GetContext(o));
-            void *ptr = PyCapsule_GetPointer(o, nullptr);
-            destructor(ptr);
+  capsule(const void *value, void (*destructor)(void *)) {
+    m_ptr = PyCapsule_New(const_cast<void *>(value), nullptr, [](PyObject *o) {
+      auto destructor =
+          reinterpret_cast<void (*)(void *)>(PyCapsule_GetContext(o));
+      void *ptr = PyCapsule_GetPointer(o, nullptr);
+      destructor(ptr);
+    });
+
+    if (!m_ptr)
+      pybind11_fail("Could not allocate capsule object!");
+
+    if (PyCapsule_SetContext(m_ptr, (void *)destructor) != 0)
+      pybind11_fail("Could not set capsule context!");
+  }
+
+  capsule(void (*destructor)()) {
+    m_ptr = PyCapsule_New(
+        reinterpret_cast<void *>(destructor), nullptr, [](PyObject *o) {
+          auto destructor =
+              reinterpret_cast<void (*)()>(PyCapsule_GetPointer(o, nullptr));
+          destructor();
         });
 
-        if (!m_ptr)
-            pybind11_fail("Could not allocate capsule object!");
+    if (!m_ptr)
+      pybind11_fail("Could not allocate capsule object!");
+  }
 
-        if (PyCapsule_SetContext(m_ptr, (void *) destructor) != 0)
-            pybind11_fail("Could not set capsule context!");
-    }
+  template <typename T> operator T *() const {
+    auto name = this->name();
+    T *result = static_cast<T *>(PyCapsule_GetPointer(m_ptr, name));
+    if (!result)
+      pybind11_fail("Unable to extract capsule contents!");
+    return result;
+  }
 
-    capsule(void (*destructor)()) {
-        m_ptr = PyCapsule_New(reinterpret_cast<void *>(destructor), nullptr, [](PyObject *o) {
-            auto destructor = reinterpret_cast<void (*)()>(PyCapsule_GetPointer(o, nullptr));
-            destructor();
-        });
-
-        if (!m_ptr)
-            pybind11_fail("Could not allocate capsule object!");
-    }
-
-    template <typename T> operator T *() const {
-        auto name = this->name();
-        T * result = static_cast<T *>(PyCapsule_GetPointer(m_ptr, name));
-        if (!result) pybind11_fail("Unable to extract capsule contents!");
-        return result;
-    }
-
-    const char *name() const { return PyCapsule_GetName(m_ptr); }
+  const char *name() const { return PyCapsule_GetName(m_ptr); }
 };
 
 class tuple : public object {
 public:
-    PYBIND11_OBJECT_CVT(tuple, object, PyTuple_Check, PySequence_Tuple)
-    explicit tuple(size_t size = 0) : object(PyTuple_New((ssize_t) size), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate tuple object!");
-    }
-    size_t size() const { return (size_t) PyTuple_Size(m_ptr); }
-    detail::tuple_accessor operator[](size_t index) const { return {*this, index}; }
-    detail::item_accessor operator[](handle h) const { return object::operator[](h); }
-    detail::tuple_iterator begin() const { return {*this, 0}; }
-    detail::tuple_iterator end() const { return {*this, PyTuple_GET_SIZE(m_ptr)}; }
+  PYBIND11_OBJECT_CVT(tuple, object, PyTuple_Check, PySequence_Tuple)
+  explicit tuple(size_t size = 0)
+      : object(PyTuple_New((ssize_t)size), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate tuple object!");
+  }
+  size_t size() const { return (size_t)PyTuple_Size(m_ptr); }
+  detail::tuple_accessor operator[](size_t index) const {
+    return {*this, index};
+  }
+  detail::item_accessor operator[](handle h) const {
+    return object::operator[](h);
+  }
+  detail::tuple_iterator begin() const { return {*this, 0}; }
+  detail::tuple_iterator end() const {
+    return {*this, PyTuple_GET_SIZE(m_ptr)};
+  }
 };
 
 class dict : public object {
 public:
-    PYBIND11_OBJECT_CVT(dict, object, PyDict_Check, raw_dict)
-    dict() : object(PyDict_New(), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate dict object!");
-    }
-    template <typename... Args,
-              typename = detail::enable_if_t<detail::all_of<detail::is_keyword_or_ds<Args>...>::value>,
-              // MSVC workaround: it can't compile an out-of-line definition, so defer the collector
-              typename collector = detail::deferred_t<detail::unpacking_collector<>, Args...>>
-    explicit dict(Args &&...args) : dict(collector(std::forward<Args>(args)...).kwargs()) { }
+  PYBIND11_OBJECT_CVT(dict, object, PyDict_Check, raw_dict)
+  dict() : object(PyDict_New(), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate dict object!");
+  }
+  template <typename... Args,
+            typename = detail::enable_if_t<
+                detail::all_of<detail::is_keyword_or_ds<Args>...>::value>,
+            // MSVC workaround: it can't compile an out-of-line definition, so
+            // defer the collector
+            typename collector =
+                detail::deferred_t<detail::unpacking_collector<>, Args...>>
+  explicit dict(Args &&... args)
+      : dict(collector(std::forward<Args>(args)...).kwargs()) {}
 
-    size_t size() const { return (size_t) PyDict_Size(m_ptr); }
-    detail::dict_iterator begin() const { return {*this, 0}; }
-    detail::dict_iterator end() const { return {}; }
-    void clear() const { PyDict_Clear(ptr()); }
-    bool contains(handle key) const { return PyDict_Contains(ptr(), key.ptr()) == 1; }
-    bool contains(const char *key) const { return PyDict_Contains(ptr(), pybind11::str(key).ptr()) == 1; }
+  size_t size() const { return (size_t)PyDict_Size(m_ptr); }
+  detail::dict_iterator begin() const { return {*this, 0}; }
+  detail::dict_iterator end() const { return {}; }
+  void clear() const { PyDict_Clear(ptr()); }
+  bool contains(handle key) const {
+    return PyDict_Contains(ptr(), key.ptr()) == 1;
+  }
+  bool contains(const char *key) const {
+    return PyDict_Contains(ptr(), pybind11::str(key).ptr()) == 1;
+  }
 
 private:
-    /// Call the `dict` Python type -- always returns a new reference
-    static PyObject *raw_dict(PyObject *op) {
-        if (PyDict_Check(op))
-            return handle(op).inc_ref().ptr();
-        return PyObject_CallFunctionObjArgs((PyObject *) &PyDict_Type, op, nullptr);
-    }
+  /// Call the `dict` Python type -- always returns a new reference
+  static PyObject *raw_dict(PyObject *op) {
+    if (PyDict_Check(op))
+      return handle(op).inc_ref().ptr();
+    return PyObject_CallFunctionObjArgs((PyObject *)&PyDict_Type, op, nullptr);
+  }
 };
 
 class sequence : public object {
 public:
-    PYBIND11_OBJECT_DEFAULT(sequence, object, PySequence_Check)
-    size_t size() const { return (size_t) PySequence_Size(m_ptr); }
-    detail::sequence_accessor operator[](size_t index) const { return {*this, index}; }
-    detail::item_accessor operator[](handle h) const { return object::operator[](h); }
-    detail::sequence_iterator begin() const { return {*this, 0}; }
-    detail::sequence_iterator end() const { return {*this, PySequence_Size(m_ptr)}; }
+  PYBIND11_OBJECT_DEFAULT(sequence, object, PySequence_Check)
+  size_t size() const { return (size_t)PySequence_Size(m_ptr); }
+  detail::sequence_accessor operator[](size_t index) const {
+    return {*this, index};
+  }
+  detail::item_accessor operator[](handle h) const {
+    return object::operator[](h);
+  }
+  detail::sequence_iterator begin() const { return {*this, 0}; }
+  detail::sequence_iterator end() const {
+    return {*this, PySequence_Size(m_ptr)};
+  }
 };
 
 class list : public object {
 public:
-    PYBIND11_OBJECT_CVT(list, object, PyList_Check, PySequence_List)
-    explicit list(size_t size = 0) : object(PyList_New((ssize_t) size), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate list object!");
-    }
-    size_t size() const { return (size_t) PyList_Size(m_ptr); }
-    detail::list_accessor operator[](size_t index) const { return {*this, index}; }
-    detail::item_accessor operator[](handle h) const { return object::operator[](h); }
-    detail::list_iterator begin() const { return {*this, 0}; }
-    detail::list_iterator end() const { return {*this, PyList_GET_SIZE(m_ptr)}; }
-    template <typename T> void append(T &&val) const {
-        PyList_Append(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr());
-    }
+  PYBIND11_OBJECT_CVT(list, object, PyList_Check, PySequence_List)
+  explicit list(size_t size = 0)
+      : object(PyList_New((ssize_t)size), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate list object!");
+  }
+  size_t size() const { return (size_t)PyList_Size(m_ptr); }
+  detail::list_accessor operator[](size_t index) const {
+    return {*this, index};
+  }
+  detail::item_accessor operator[](handle h) const {
+    return object::operator[](h);
+  }
+  detail::list_iterator begin() const { return {*this, 0}; }
+  detail::list_iterator end() const { return {*this, PyList_GET_SIZE(m_ptr)}; }
+  template <typename T> void append(T &&val) const {
+    PyList_Append(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr());
+  }
 };
 
-class args : public tuple { PYBIND11_OBJECT_DEFAULT(args, tuple, PyTuple_Check) };
-class kwargs : public dict { PYBIND11_OBJECT_DEFAULT(kwargs, dict, PyDict_Check)  };
+class args : public tuple {
+  PYBIND11_OBJECT_DEFAULT(args, tuple, PyTuple_Check)
+};
+class kwargs : public dict {
+  PYBIND11_OBJECT_DEFAULT(kwargs, dict, PyDict_Check)
+};
 
 class set : public object {
 public:
-    PYBIND11_OBJECT_CVT(set, object, PySet_Check, PySet_New)
-    set() : object(PySet_New(nullptr), stolen_t{}) {
-        if (!m_ptr) pybind11_fail("Could not allocate set object!");
-    }
-    size_t size() const { return (size_t) PySet_Size(m_ptr); }
-    template <typename T> bool add(T &&val) const {
-        return PySet_Add(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr()) == 0;
-    }
-    void clear() const { PySet_Clear(m_ptr); }
+  PYBIND11_OBJECT_CVT(set, object, PySet_Check, PySet_New)
+  set() : object(PySet_New(nullptr), stolen_t{}) {
+    if (!m_ptr)
+      pybind11_fail("Could not allocate set object!");
+  }
+  size_t size() const { return (size_t)PySet_Size(m_ptr); }
+  template <typename T> bool add(T &&val) const {
+    return PySet_Add(m_ptr,
+                     detail::object_or_cast(std::forward<T>(val)).ptr()) == 0;
+  }
+  void clear() const { PySet_Clear(m_ptr); }
 };
 
 class function : public object {
 public:
-    PYBIND11_OBJECT_DEFAULT(function, object, PyCallable_Check)
-    handle cpp_function() const {
-        handle fun = detail::get_function(m_ptr);
-        if (fun && PyCFunction_Check(fun.ptr()))
-            return fun;
-        return handle();
-    }
-    bool is_cpp_function() const { return (bool) cpp_function(); }
+  PYBIND11_OBJECT_DEFAULT(function, object, PyCallable_Check)
+  handle cpp_function() const {
+    handle fun = detail::get_function(m_ptr);
+    if (fun && PyCFunction_Check(fun.ptr()))
+      return fun;
+    return handle();
+  }
+  bool is_cpp_function() const { return (bool)cpp_function(); }
 };
 
 class staticmethod : public object {
 public:
-    PYBIND11_OBJECT_CVT(staticmethod, object, detail::PyStaticMethod_Check, PyStaticMethod_New)
+  PYBIND11_OBJECT_CVT(staticmethod, object, detail::PyStaticMethod_Check,
+                      PyStaticMethod_New)
 };
 
 class buffer : public object {
 public:
-    PYBIND11_OBJECT_DEFAULT(buffer, object, PyObject_CheckBuffer)
+  PYBIND11_OBJECT_DEFAULT(buffer, object, PyObject_CheckBuffer)
 
-    buffer_info request(bool writable = false) {
-        int flags = PyBUF_STRIDES | PyBUF_FORMAT;
-        if (writable) flags |= PyBUF_WRITABLE;
-        Py_buffer *view = new Py_buffer();
-        if (PyObject_GetBuffer(m_ptr, view, flags) != 0) {
-            delete view;
-            throw error_already_set();
-        }
-        return buffer_info(view);
+  buffer_info request(bool writable = false) {
+    int flags = PyBUF_STRIDES | PyBUF_FORMAT;
+    if (writable)
+      flags |= PyBUF_WRITABLE;
+    Py_buffer *view = new Py_buffer();
+    if (PyObject_GetBuffer(m_ptr, view, flags) != 0) {
+      delete view;
+      throw error_already_set();
     }
+    return buffer_info(view);
+  }
 };
 
 class memoryview : public object {
 public:
-    explicit memoryview(const buffer_info& info) {
-        static Py_buffer buf { };
-        // Py_buffer uses signed sizes, strides and shape!..
-        static std::vector<Py_ssize_t> py_strides { };
-        static std::vector<Py_ssize_t> py_shape { };
-        buf.buf = info.ptr;
-        buf.itemsize = info.itemsize;
-        buf.format = const_cast<char *>(info.format.c_str());
-        buf.ndim = (int) info.ndim;
-        buf.len = info.size;
-        py_strides.clear();
-        py_shape.clear();
-        for (size_t i = 0; i < (size_t) info.ndim; ++i) {
-            py_strides.push_back(info.strides[i]);
-            py_shape.push_back(info.shape[i]);
-        }
-        buf.strides = py_strides.data();
-        buf.shape = py_shape.data();
-        buf.suboffsets = nullptr;
-        buf.readonly = false;
-        buf.internal = nullptr;
-
-        m_ptr = PyMemoryView_FromBuffer(&buf);
-        if (!m_ptr)
-            pybind11_fail("Unable to create memoryview from buffer descriptor");
+  explicit memoryview(const buffer_info &info) {
+    static Py_buffer buf{};
+    // Py_buffer uses signed sizes, strides and shape!..
+    static std::vector<Py_ssize_t> py_strides{};
+    static std::vector<Py_ssize_t> py_shape{};
+    buf.buf = info.ptr;
+    buf.itemsize = info.itemsize;
+    buf.format = const_cast<char *>(info.format.c_str());
+    buf.ndim = (int)info.ndim;
+    buf.len = info.size;
+    py_strides.clear();
+    py_shape.clear();
+    for (size_t i = 0; i < (size_t)info.ndim; ++i) {
+      py_strides.push_back(info.strides[i]);
+      py_shape.push_back(info.shape[i]);
     }
+    buf.strides = py_strides.data();
+    buf.shape = py_shape.data();
+    buf.suboffsets = nullptr;
+    buf.readonly = false;
+    buf.internal = nullptr;
 
-    PYBIND11_OBJECT_CVT(memoryview, object, PyMemoryView_Check, PyMemoryView_FromObject)
+    m_ptr = PyMemoryView_FromBuffer(&buf);
+    if (!m_ptr)
+      pybind11_fail("Unable to create memoryview from buffer descriptor");
+  }
+
+  PYBIND11_OBJECT_CVT(memoryview, object, PyMemoryView_Check,
+                      PyMemoryView_FromObject)
 };
 /// @} pytypes
 
 /// \addtogroup python_builtins
 /// @{
 inline size_t len(handle h) {
-    ssize_t result = PyObject_Length(h.ptr());
-    if (result < 0)
-        pybind11_fail("Unable to compute length of object");
-    return (size_t) result;
+  ssize_t result = PyObject_Length(h.ptr());
+  if (result < 0)
+    pybind11_fail("Unable to compute length of object");
+  return (size_t)result;
 }
 
 inline size_t len_hint(handle h) {
 #if PY_VERSION_HEX >= 0x03040000
-    ssize_t result = PyObject_LengthHint(h.ptr(), 0);
+  ssize_t result = PyObject_LengthHint(h.ptr(), 0);
 #else
-    ssize_t result = PyObject_Length(h.ptr());
+  ssize_t result = PyObject_Length(h.ptr());
 #endif
-    if (result < 0) {
-        // Sometimes a length can't be determined at all (eg generators)
-        // In which case simply return 0
-        PyErr_Clear();
-        return 0;
-    }
-    return (size_t) result;
+  if (result < 0) {
+    // Sometimes a length can't be determined at all (eg generators)
+    // In which case simply return 0
+    PyErr_Clear();
+    return 0;
+  }
+  return (size_t)result;
 }
 
 inline str repr(handle h) {
-    PyObject *str_value = PyObject_Repr(h.ptr());
-    if (!str_value) throw error_already_set();
+  PyObject *str_value = PyObject_Repr(h.ptr());
+  if (!str_value)
+    throw error_already_set();
 #if PY_MAJOR_VERSION < 3
-    PyObject *unicode = PyUnicode_FromEncodedObject(str_value, "utf-8", nullptr);
-    Py_XDECREF(str_value); str_value = unicode;
-    if (!str_value) throw error_already_set();
+  PyObject *unicode = PyUnicode_FromEncodedObject(str_value, "utf-8", nullptr);
+  Py_XDECREF(str_value);
+  str_value = unicode;
+  if (!str_value)
+    throw error_already_set();
 #endif
-    return reinterpret_steal<str>(str_value);
+  return reinterpret_steal<str>(str_value);
 }
 
 inline iterator iter(handle obj) {
-    PyObject *result = PyObject_GetIter(obj.ptr());
-    if (!result) { throw error_already_set(); }
-    return reinterpret_steal<iterator>(result);
+  PyObject *result = PyObject_GetIter(obj.ptr());
+  if (!result) {
+    throw error_already_set();
+  }
+  return reinterpret_steal<iterator>(result);
 }
 /// @} python_builtins
 
 NAMESPACE_BEGIN(detail)
-template <typename D> iterator object_api<D>::begin() const { return iter(derived()); }
-template <typename D> iterator object_api<D>::end() const { return iterator::sentinel(); }
-template <typename D> item_accessor object_api<D>::operator[](handle key) const {
-    return {derived(), reinterpret_borrow<object>(key)};
+template <typename D> iterator object_api<D>::begin() const {
+  return iter(derived());
 }
-template <typename D> item_accessor object_api<D>::operator[](const char *key) const {
-    return {derived(), pybind11::str(key)};
+template <typename D> iterator object_api<D>::end() const {
+  return iterator::sentinel();
+}
+template <typename D>
+item_accessor object_api<D>::operator[](handle key) const {
+  return {derived(), reinterpret_borrow<object>(key)};
+}
+template <typename D>
+item_accessor object_api<D>::operator[](const char *key) const {
+  return {derived(), pybind11::str(key)};
 }
 template <typename D> obj_attr_accessor object_api<D>::attr(handle key) const {
-    return {derived(), reinterpret_borrow<object>(key)};
+  return {derived(), reinterpret_borrow<object>(key)};
 }
-template <typename D> str_attr_accessor object_api<D>::attr(const char *key) const {
-    return {derived(), key};
+template <typename D>
+str_attr_accessor object_api<D>::attr(const char *key) const {
+  return {derived(), key};
 }
 template <typename D> args_proxy object_api<D>::operator*() const {
-    return args_proxy(derived().ptr());
+  return args_proxy(derived().ptr());
 }
-template <typename D> template <typename T> bool object_api<D>::contains(T &&item) const {
-    return attr("__contains__")(std::forward<T>(item)).template cast<bool>();
+template <typename D>
+template <typename T>
+bool object_api<D>::contains(T &&item) const {
+  return attr("__contains__")(std::forward<T>(item)).template cast<bool>();
 }
 
-template <typename D>
-pybind11::str object_api<D>::str() const { return pybind11::str(derived()); }
+template <typename D> pybind11::str object_api<D>::str() const {
+  return pybind11::str(derived());
+}
 
-template <typename D>
-str_attr_accessor object_api<D>::doc() const { return attr("__doc__"); }
+template <typename D> str_attr_accessor object_api<D>::doc() const {
+  return attr("__doc__");
+}
 
-template <typename D>
-handle object_api<D>::get_type() const { return (PyObject *) Py_TYPE(derived().ptr()); }
+template <typename D> handle object_api<D>::get_type() const {
+  return (PyObject *)Py_TYPE(derived().ptr());
+}
 
 template <typename D>
 bool object_api<D>::rich_compare(object_api const &other, int value) const {
-    int rv = PyObject_RichCompareBool(derived().ptr(), other.derived().ptr(), value);
-    if (rv == -1)
-        throw error_already_set();
-    return rv == 1;
+  int rv =
+      PyObject_RichCompareBool(derived().ptr(), other.derived().ptr(), value);
+  if (rv == -1)
+    throw error_already_set();
+  return rv == 1;
 }
 
 #define PYBIND11_MATH_OPERATOR_UNARY(op, fn)                                   \
-    template <typename D> object object_api<D>::op() const {                   \
-        object result = reinterpret_steal<object>(fn(derived().ptr()));        \
-        if (!result.ptr())                                                     \
-            throw error_already_set();                                         \
-        return result;                                                         \
-    }
+  template <typename D> object object_api<D>::op() const {                     \
+    object result = reinterpret_steal<object>(fn(derived().ptr()));            \
+    if (!result.ptr())                                                         \
+      throw error_already_set();                                               \
+    return result;                                                             \
+  }
 
 #define PYBIND11_MATH_OPERATOR_BINARY(op, fn)                                  \
-    template <typename D>                                                      \
-    object object_api<D>::op(object_api const &other) const {                  \
-        object result = reinterpret_steal<object>(                             \
-            fn(derived().ptr(), other.derived().ptr()));                       \
-        if (!result.ptr())                                                     \
-            throw error_already_set();                                         \
-        return result;                                                         \
-    }
+  template <typename D>                                                        \
+  object object_api<D>::op(object_api const &other) const {                    \
+    object result =                                                            \
+        reinterpret_steal<object>(fn(derived().ptr(), other.derived().ptr())); \
+    if (!result.ptr())                                                         \
+      throw error_already_set();                                               \
+    return result;                                                             \
+  }
 
-PYBIND11_MATH_OPERATOR_UNARY (operator~,   PyNumber_Invert)
-PYBIND11_MATH_OPERATOR_UNARY (operator-,   PyNumber_Negative)
-PYBIND11_MATH_OPERATOR_BINARY(operator+,   PyNumber_Add)
-PYBIND11_MATH_OPERATOR_BINARY(operator+=,  PyNumber_InPlaceAdd)
-PYBIND11_MATH_OPERATOR_BINARY(operator-,   PyNumber_Subtract)
-PYBIND11_MATH_OPERATOR_BINARY(operator-=,  PyNumber_InPlaceSubtract)
-PYBIND11_MATH_OPERATOR_BINARY(operator*,   PyNumber_Multiply)
-PYBIND11_MATH_OPERATOR_BINARY(operator*=,  PyNumber_InPlaceMultiply)
-PYBIND11_MATH_OPERATOR_BINARY(operator/,   PyNumber_TrueDivide)
-PYBIND11_MATH_OPERATOR_BINARY(operator/=,  PyNumber_InPlaceTrueDivide)
-PYBIND11_MATH_OPERATOR_BINARY(operator|,   PyNumber_Or)
-PYBIND11_MATH_OPERATOR_BINARY(operator|=,  PyNumber_InPlaceOr)
-PYBIND11_MATH_OPERATOR_BINARY(operator&,   PyNumber_And)
-PYBIND11_MATH_OPERATOR_BINARY(operator&=,  PyNumber_InPlaceAnd)
-PYBIND11_MATH_OPERATOR_BINARY(operator^,   PyNumber_Xor)
-PYBIND11_MATH_OPERATOR_BINARY(operator^=,  PyNumber_InPlaceXor)
-PYBIND11_MATH_OPERATOR_BINARY(operator<<,  PyNumber_Lshift)
+PYBIND11_MATH_OPERATOR_UNARY(operator~, PyNumber_Invert)
+PYBIND11_MATH_OPERATOR_UNARY(operator-, PyNumber_Negative)
+PYBIND11_MATH_OPERATOR_BINARY(operator+, PyNumber_Add)
+PYBIND11_MATH_OPERATOR_BINARY(operator+=, PyNumber_InPlaceAdd)
+PYBIND11_MATH_OPERATOR_BINARY(operator-, PyNumber_Subtract)
+PYBIND11_MATH_OPERATOR_BINARY(operator-=, PyNumber_InPlaceSubtract)
+PYBIND11_MATH_OPERATOR_BINARY(operator*, PyNumber_Multiply)
+PYBIND11_MATH_OPERATOR_BINARY(operator*=, PyNumber_InPlaceMultiply)
+PYBIND11_MATH_OPERATOR_BINARY(operator/, PyNumber_TrueDivide)
+PYBIND11_MATH_OPERATOR_BINARY(operator/=, PyNumber_InPlaceTrueDivide)
+PYBIND11_MATH_OPERATOR_BINARY(operator|, PyNumber_Or)
+PYBIND11_MATH_OPERATOR_BINARY(operator|=, PyNumber_InPlaceOr)
+PYBIND11_MATH_OPERATOR_BINARY(operator&, PyNumber_And)
+PYBIND11_MATH_OPERATOR_BINARY(operator&=, PyNumber_InPlaceAnd)
+PYBIND11_MATH_OPERATOR_BINARY(operator^, PyNumber_Xor)
+PYBIND11_MATH_OPERATOR_BINARY(operator^=, PyNumber_InPlaceXor)
+PYBIND11_MATH_OPERATOR_BINARY(operator<<, PyNumber_Lshift)
 PYBIND11_MATH_OPERATOR_BINARY(operator<<=, PyNumber_InPlaceLshift)
-PYBIND11_MATH_OPERATOR_BINARY(operator>>,  PyNumber_Rshift)
+PYBIND11_MATH_OPERATOR_BINARY(operator>>, PyNumber_Rshift)
 PYBIND11_MATH_OPERATOR_BINARY(operator>>=, PyNumber_InPlaceRshift)
 
 #undef PYBIND11_MATH_OPERATOR_UNARY
diff --git a/python/src/pybind11/stl.h b/python/src/pybind11/stl.h
index 32f8d294a..95f890150 100644
--- a/python/src/pybind11/stl.h
+++ b/python/src/pybind11/stl.h
@@ -10,373 +10,411 @@
 #pragma once
 
 #include "pybind11.h"
-#include <set>
-#include <unordered_set>
-#include <map>
-#include <unordered_map>
+#include <deque>
 #include <iostream>
 #include <list>
-#include <deque>
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
 #include <valarray>
 
 #if defined(_MSC_VER)
 #pragma warning(push)
-#pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#pragma warning(                                                               \
+    disable : 4127) // warning C4127: Conditional expression is constant
 #endif
 
 #ifdef __has_include
 // std::optional (but including it in c++14 mode isn't allowed)
-#  if defined(PYBIND11_CPP17) && __has_include(<optional>)
-#    include <optional>
-#    define PYBIND11_HAS_OPTIONAL 1
-#  endif
+#if defined(PYBIND11_CPP17) && __has_include(<optional>)
+#include <optional>
+#define PYBIND11_HAS_OPTIONAL 1
+#endif
 // std::experimental::optional (but not allowed in c++11 mode)
-#  if defined(PYBIND11_CPP14) && (__has_include(<experimental/optional>) && \
+#if defined(PYBIND11_CPP14) && (__has_include(<experimental/optional>) && \
                                  !__has_include(<optional>))
-#    include <experimental/optional>
-#    define PYBIND11_HAS_EXP_OPTIONAL 1
-#  endif
+#include <experimental/optional>
+#define PYBIND11_HAS_EXP_OPTIONAL 1
+#endif
 // std::variant
-#  if defined(PYBIND11_CPP17) && __has_include(<variant>)
-#    include <variant>
-#    define PYBIND11_HAS_VARIANT 1
-#  endif
+#if defined(PYBIND11_CPP17) && __has_include(<variant>)
+#include <variant>
+#define PYBIND11_HAS_VARIANT 1
+#endif
 #elif defined(_MSC_VER) && defined(PYBIND11_CPP17)
-#  include <optional>
-#  include <variant>
-#  define PYBIND11_HAS_OPTIONAL 1
-#  define PYBIND11_HAS_VARIANT 1
+#include <optional>
+#include <variant>
+#define PYBIND11_HAS_OPTIONAL 1
+#define PYBIND11_HAS_VARIANT 1
 #endif
 
 NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
-/// Extracts an const lvalue reference or rvalue reference for U based on the type of T (e.g. for
-/// forwarding a container element).  Typically used indirect via forwarded_type(), below.
+/// Extracts a const lvalue reference or rvalue reference for U based on the
+/// type of T (e.g. for forwarding a container element).  Typically used
+/// indirectly via forward_like(), below.
 template <typename T, typename U>
-using forwarded_type = conditional_t<
-    std::is_lvalue_reference<T>::value, remove_reference_t<U> &, remove_reference_t<U> &&>;
+using forwarded_type =
+    conditional_t<std::is_lvalue_reference<T>::value, remove_reference_t<U> &,
+                  remove_reference_t<U> &&>;
 
-/// Forwards a value U as rvalue or lvalue according to whether T is rvalue or lvalue; typically
-/// used for forwarding a container's elements.
-template <typename T, typename U>
-forwarded_type<T, U> forward_like(U &&u) {
-    return std::forward<detail::forwarded_type<T, U>>(std::forward<U>(u));
+/// Forwards a value U as rvalue or lvalue according to whether T is rvalue or
+/// lvalue; typically used for forwarding a container's elements.
+template <typename T, typename U> forwarded_type<T, U> forward_like(U &&u) {
+  return std::forward<detail::forwarded_type<T, U>>(std::forward<U>(u));
 }
 
 template <typename Type, typename Key> struct set_caster {
-    using type = Type;
-    using key_conv = make_caster<Key>;
+  using type = Type;
+  using key_conv = make_caster<Key>;
 
-    bool load(handle src, bool convert) {
-        if (!isinstance<pybind11::set>(src))
-            return false;
-        auto s = reinterpret_borrow<pybind11::set>(src);
-        value.clear();
-        for (auto entry : s) {
-            key_conv conv;
-            if (!conv.load(entry, convert))
-                return false;
-            value.insert(cast_op<Key &&>(std::move(conv)));
-        }
-        return true;
+  bool load(handle src, bool convert) {
+    if (!isinstance<pybind11::set>(src))
+      return false;
+    auto s = reinterpret_borrow<pybind11::set>(src);
+    value.clear();
+    for (auto entry : s) {
+      key_conv conv;
+      if (!conv.load(entry, convert))
+        return false;
+      value.insert(cast_op<Key &&>(std::move(conv)));
     }
+    return true;
+  }
 
-    template <typename T>
-    static handle cast(T &&src, return_value_policy policy, handle parent) {
-        if (!std::is_lvalue_reference<T>::value)
-            policy = return_value_policy_override<Key>::policy(policy);
-        pybind11::set s;
-        for (auto &&value : src) {
-            auto value_ = reinterpret_steal<object>(key_conv::cast(forward_like<T>(value), policy, parent));
-            if (!value_ || !s.add(value_))
-                return handle();
-        }
-        return s.release();
+  template <typename T>
+  static handle cast(T &&src, return_value_policy policy, handle parent) {
+    if (!std::is_lvalue_reference<T>::value)
+      policy = return_value_policy_override<Key>::policy(policy);
+    pybind11::set s;
+    for (auto &&value : src) {
+      auto value_ = reinterpret_steal<object>(
+          key_conv::cast(forward_like<T>(value), policy, parent));
+      if (!value_ || !s.add(value_))
+        return handle();
     }
+    return s.release();
+  }
 
-    PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name + _("]"));
+  PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name + _("]"));
 };
 
 template <typename Type, typename Key, typename Value> struct map_caster {
-    using key_conv   = make_caster<Key>;
-    using value_conv = make_caster<Value>;
+  using key_conv = make_caster<Key>;
+  using value_conv = make_caster<Value>;
 
-    bool load(handle src, bool convert) {
-        if (!isinstance<dict>(src))
-            return false;
-        auto d = reinterpret_borrow<dict>(src);
-        value.clear();
-        for (auto it : d) {
-            key_conv kconv;
-            value_conv vconv;
-            if (!kconv.load(it.first.ptr(), convert) ||
-                !vconv.load(it.second.ptr(), convert))
-                return false;
-            value.emplace(cast_op<Key &&>(std::move(kconv)), cast_op<Value &&>(std::move(vconv)));
-        }
-        return true;
+  bool load(handle src, bool convert) {
+    if (!isinstance<dict>(src))
+      return false;
+    auto d = reinterpret_borrow<dict>(src);
+    value.clear();
+    for (auto it : d) {
+      key_conv kconv;
+      value_conv vconv;
+      if (!kconv.load(it.first.ptr(), convert) ||
+          !vconv.load(it.second.ptr(), convert))
+        return false;
+      value.emplace(cast_op<Key &&>(std::move(kconv)),
+                    cast_op<Value &&>(std::move(vconv)));
     }
+    return true;
+  }
 
-    template <typename T>
-    static handle cast(T &&src, return_value_policy policy, handle parent) {
-        dict d;
-        return_value_policy policy_key = policy;
-        return_value_policy policy_value = policy;
-        if (!std::is_lvalue_reference<T>::value) {
-            policy_key = return_value_policy_override<Key>::policy(policy_key);
-            policy_value = return_value_policy_override<Value>::policy(policy_value);
-        }
-        for (auto &&kv : src) {
-            auto key = reinterpret_steal<object>(key_conv::cast(forward_like<T>(kv.first), policy_key, parent));
-            auto value = reinterpret_steal<object>(value_conv::cast(forward_like<T>(kv.second), policy_value, parent));
-            if (!key || !value)
-                return handle();
-            d[key] = value;
-        }
-        return d.release();
+  template <typename T>
+  static handle cast(T &&src, return_value_policy policy, handle parent) {
+    dict d;
+    return_value_policy policy_key = policy;
+    return_value_policy policy_value = policy;
+    if (!std::is_lvalue_reference<T>::value) {
+      policy_key = return_value_policy_override<Key>::policy(policy_key);
+      policy_value = return_value_policy_override<Value>::policy(policy_value);
     }
+    for (auto &&kv : src) {
+      auto key = reinterpret_steal<object>(
+          key_conv::cast(forward_like<T>(kv.first), policy_key, parent));
+      auto value = reinterpret_steal<object>(
+          value_conv::cast(forward_like<T>(kv.second), policy_value, parent));
+      if (!key || !value)
+        return handle();
+      d[key] = value;
+    }
+    return d.release();
+  }
 
-    PYBIND11_TYPE_CASTER(Type, _("Dict[") + key_conv::name + _(", ") + value_conv::name + _("]"));
+  PYBIND11_TYPE_CASTER(Type, _("Dict[") + key_conv::name + _(", ") +
+                                 value_conv::name + _("]"));
 };
 
 template <typename Type, typename Value> struct list_caster {
-    using value_conv = make_caster<Value>;
+  using value_conv = make_caster<Value>;
 
-    bool load(handle src, bool convert) {
-        if (!isinstance<sequence>(src) || isinstance<str>(src))
-            return false;
-        auto s = reinterpret_borrow<sequence>(src);
-        value.clear();
-        reserve_maybe(s, &value);
-        for (auto it : s) {
-            value_conv conv;
-            if (!conv.load(it, convert))
-                return false;
-            value.push_back(cast_op<Value &&>(std::move(conv)));
-        }
-        return true;
+  bool load(handle src, bool convert) {
+    if (!isinstance<sequence>(src) || isinstance<str>(src))
+      return false;
+    auto s = reinterpret_borrow<sequence>(src);
+    value.clear();
+    reserve_maybe(s, &value);
+    for (auto it : s) {
+      value_conv conv;
+      if (!conv.load(it, convert))
+        return false;
+      value.push_back(cast_op<Value &&>(std::move(conv)));
     }
+    return true;
+  }
 
 private:
-    template <typename T = Type,
-              enable_if_t<std::is_same<decltype(std::declval<T>().reserve(0)), void>::value, int> = 0>
-    void reserve_maybe(sequence s, Type *) { value.reserve(s.size()); }
-    void reserve_maybe(sequence, void *) { }
+  template <typename T = Type,
+            enable_if_t<std::is_same<decltype(std::declval<T>().reserve(0)),
+                                     void>::value,
+                        int> = 0>
+  void reserve_maybe(sequence s, Type *) {
+    value.reserve(s.size());
+  }
+  void reserve_maybe(sequence, void *) {}
 
 public:
-    template <typename T>
-    static handle cast(T &&src, return_value_policy policy, handle parent) {
-        if (!std::is_lvalue_reference<T>::value)
-            policy = return_value_policy_override<Value>::policy(policy);
-        list l(src.size());
-        size_t index = 0;
-        for (auto &&value : src) {
-            auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
-            if (!value_)
-                return handle();
-            PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
-        }
-        return l.release();
+  template <typename T>
+  static handle cast(T &&src, return_value_policy policy, handle parent) {
+    if (!std::is_lvalue_reference<T>::value)
+      policy = return_value_policy_override<Value>::policy(policy);
+    list l(src.size());
+    size_t index = 0;
+    for (auto &&value : src) {
+      auto value_ = reinterpret_steal<object>(
+          value_conv::cast(forward_like<T>(value), policy, parent));
+      if (!value_)
+        return handle();
+      PyList_SET_ITEM(l.ptr(), (ssize_t)index++,
+                      value_.release().ptr()); // steals a reference
     }
+    return l.release();
+  }
 
-    PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name + _("]"));
+  PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name + _("]"));
 };
 
-template <typename Type, typename Alloc> struct type_caster<std::vector<Type, Alloc>>
- : list_caster<std::vector<Type, Alloc>, Type> { };
+template <typename Type, typename Alloc>
+struct type_caster<std::vector<Type, Alloc>>
+    : list_caster<std::vector<Type, Alloc>, Type> {};
 
-template <typename Type, typename Alloc> struct type_caster<std::deque<Type, Alloc>>
- : list_caster<std::deque<Type, Alloc>, Type> { };
+template <typename Type, typename Alloc>
+struct type_caster<std::deque<Type, Alloc>>
+    : list_caster<std::deque<Type, Alloc>, Type> {};
 
-template <typename Type, typename Alloc> struct type_caster<std::list<Type, Alloc>>
- : list_caster<std::list<Type, Alloc>, Type> { };
+template <typename Type, typename Alloc>
+struct type_caster<std::list<Type, Alloc>>
+    : list_caster<std::list<Type, Alloc>, Type> {};
 
-template <typename ArrayType, typename Value, bool Resizable, size_t Size = 0> struct array_caster {
-    using value_conv = make_caster<Value>;
+template <typename ArrayType, typename Value, bool Resizable, size_t Size = 0>
+struct array_caster {
+  using value_conv = make_caster<Value>;
 
 private:
-    template <bool R = Resizable>
-    bool require_size(enable_if_t<R, size_t> size) {
-        if (value.size() != size)
-            value.resize(size);
-        return true;
-    }
-    template <bool R = Resizable>
-    bool require_size(enable_if_t<!R, size_t> size) {
-        return size == Size;
-    }
+  template <bool R = Resizable> bool require_size(enable_if_t<R, size_t> size) {
+    if (value.size() != size)
+      value.resize(size);
+    return true;
+  }
+  template <bool R = Resizable>
+  bool require_size(enable_if_t<!R, size_t> size) {
+    return size == Size;
+  }
 
 public:
-    bool load(handle src, bool convert) {
-        if (!isinstance<sequence>(src))
-            return false;
-        auto l = reinterpret_borrow<sequence>(src);
-        if (!require_size(l.size()))
-            return false;
-        size_t ctr = 0;
-        for (auto it : l) {
-            value_conv conv;
-            if (!conv.load(it, convert))
-                return false;
-            value[ctr++] = cast_op<Value &&>(std::move(conv));
-        }
-        return true;
+  bool load(handle src, bool convert) {
+    if (!isinstance<sequence>(src))
+      return false;
+    auto l = reinterpret_borrow<sequence>(src);
+    if (!require_size(l.size()))
+      return false;
+    size_t ctr = 0;
+    for (auto it : l) {
+      value_conv conv;
+      if (!conv.load(it, convert))
+        return false;
+      value[ctr++] = cast_op<Value &&>(std::move(conv));
     }
+    return true;
+  }
 
-    template <typename T>
-    static handle cast(T &&src, return_value_policy policy, handle parent) {
-        list l(src.size());
-        size_t index = 0;
-        for (auto &&value : src) {
-            auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
-            if (!value_)
-                return handle();
-            PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
-        }
-        return l.release();
+  template <typename T>
+  static handle cast(T &&src, return_value_policy policy, handle parent) {
+    list l(src.size());
+    size_t index = 0;
+    for (auto &&value : src) {
+      auto value_ = reinterpret_steal<object>(
+          value_conv::cast(forward_like<T>(value), policy, parent));
+      if (!value_)
+        return handle();
+      PyList_SET_ITEM(l.ptr(), (ssize_t)index++,
+                      value_.release().ptr()); // steals a reference
     }
+    return l.release();
+  }
 
-    PYBIND11_TYPE_CASTER(ArrayType, _("List[") + value_conv::name + _<Resizable>(_(""), _("[") + _<Size>() + _("]")) + _("]"));
+  PYBIND11_TYPE_CASTER(ArrayType,
+                       _("List[") + value_conv::name +
+                           _<Resizable>(_(""), _("[") + _<Size>() + _("]")) +
+                           _("]"));
 };
 
-template <typename Type, size_t Size> struct type_caster<std::array<Type, Size>>
- : array_caster<std::array<Type, Size>, Type, false, Size> { };
+template <typename Type, size_t Size>
+struct type_caster<std::array<Type, Size>>
+    : array_caster<std::array<Type, Size>, Type, false, Size> {};
 
-template <typename Type> struct type_caster<std::valarray<Type>>
- : array_caster<std::valarray<Type>, Type, true> { };
+template <typename Type>
+struct type_caster<std::valarray<Type>>
+    : array_caster<std::valarray<Type>, Type, true> {};
 
-template <typename Key, typename Compare, typename Alloc> struct type_caster<std::set<Key, Compare, Alloc>>
-  : set_caster<std::set<Key, Compare, Alloc>, Key> { };
+template <typename Key, typename Compare, typename Alloc>
+struct type_caster<std::set<Key, Compare, Alloc>>
+    : set_caster<std::set<Key, Compare, Alloc>, Key> {};
 
-template <typename Key, typename Hash, typename Equal, typename Alloc> struct type_caster<std::unordered_set<Key, Hash, Equal, Alloc>>
-  : set_caster<std::unordered_set<Key, Hash, Equal, Alloc>, Key> { };
+template <typename Key, typename Hash, typename Equal, typename Alloc>
+struct type_caster<std::unordered_set<Key, Hash, Equal, Alloc>>
+    : set_caster<std::unordered_set<Key, Hash, Equal, Alloc>, Key> {};
 
-template <typename Key, typename Value, typename Compare, typename Alloc> struct type_caster<std::map<Key, Value, Compare, Alloc>>
-  : map_caster<std::map<Key, Value, Compare, Alloc>, Key, Value> { };
+template <typename Key, typename Value, typename Compare, typename Alloc>
+struct type_caster<std::map<Key, Value, Compare, Alloc>>
+    : map_caster<std::map<Key, Value, Compare, Alloc>, Key, Value> {};
 
-template <typename Key, typename Value, typename Hash, typename Equal, typename Alloc> struct type_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>>
-  : map_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>, Key, Value> { };
+template <typename Key, typename Value, typename Hash, typename Equal,
+          typename Alloc>
+struct type_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>>
+    : map_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>, Key,
+                 Value> {};
 
-// This type caster is intended to be used for std::optional and std::experimental::optional
-template<typename T> struct optional_caster {
-    using value_conv = make_caster<typename T::value_type>;
+// This type caster is intended to be used for std::optional and
+// std::experimental::optional
+template <typename T> struct optional_caster {
+  using value_conv = make_caster<typename T::value_type>;
 
-    template <typename T_>
-    static handle cast(T_ &&src, return_value_policy policy, handle parent) {
-        if (!src)
-            return none().inc_ref();
-        policy = return_value_policy_override<typename T::value_type>::policy(policy);
-        return value_conv::cast(*std::forward<T_>(src), policy, parent);
+  template <typename T_>
+  static handle cast(T_ &&src, return_value_policy policy, handle parent) {
+    if (!src)
+      return none().inc_ref();
+    policy =
+        return_value_policy_override<typename T::value_type>::policy(policy);
+    return value_conv::cast(*std::forward<T_>(src), policy, parent);
+  }
+
+  bool load(handle src, bool convert) {
+    if (!src) {
+      return false;
+    } else if (src.is_none()) {
+      return true; // default-constructed value is already empty
     }
+    value_conv inner_caster;
+    if (!inner_caster.load(src, convert))
+      return false;
 
-    bool load(handle src, bool convert) {
-        if (!src) {
-            return false;
-        } else if (src.is_none()) {
-            return true;  // default-constructed value is already empty
-        }
-        value_conv inner_caster;
-        if (!inner_caster.load(src, convert))
-            return false;
+    value.emplace(cast_op<typename T::value_type &&>(std::move(inner_caster)));
+    return true;
+  }
 
-        value.emplace(cast_op<typename T::value_type &&>(std::move(inner_caster)));
-        return true;
-    }
-
-    PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name + _("]"));
+  PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name + _("]"));
 };
 
 #if PYBIND11_HAS_OPTIONAL
-template<typename T> struct type_caster<std::optional<T>>
+template <typename T>
+struct type_caster<std::optional<T>>
     : public optional_caster<std::optional<T>> {};
 
-template<> struct type_caster<std::nullopt_t>
-    : public void_caster<std::nullopt_t> {};
+template <>
+struct type_caster<std::nullopt_t> : public void_caster<std::nullopt_t> {};
 #endif
 
 #if PYBIND11_HAS_EXP_OPTIONAL
-template<typename T> struct type_caster<std::experimental::optional<T>>
+template <typename T>
+struct type_caster<std::experimental::optional<T>>
     : public optional_caster<std::experimental::optional<T>> {};
 
-template<> struct type_caster<std::experimental::nullopt_t>
+template <>
+struct type_caster<std::experimental::nullopt_t>
     : public void_caster<std::experimental::nullopt_t> {};
 #endif
 
 /// Visit a variant and cast any found type to Python
 struct variant_caster_visitor {
-    return_value_policy policy;
-    handle parent;
+  return_value_policy policy;
+  handle parent;
 
-    using result_type = handle; // required by boost::variant in C++11
+  using result_type = handle; // required by boost::variant in C++11
 
-    template <typename T>
-    result_type operator()(T &&src) const {
-        return make_caster<T>::cast(std::forward<T>(src), policy, parent);
-    }
+  template <typename T> result_type operator()(T &&src) const {
+    return make_caster<T>::cast(std::forward<T>(src), policy, parent);
+  }
 };
 
-/// Helper class which abstracts away variant's `visit` function. `std::variant` and similar
-/// `namespace::variant` types which provide a `namespace::visit()` function are handled here
-/// automatically using argument-dependent lookup. Users can provide specializations for other
-/// variant-like classes, e.g. `boost::variant` and `boost::apply_visitor`.
-template <template<typename...> class Variant>
-struct visit_helper {
-    template <typename... Args>
-    static auto call(Args &&...args) -> decltype(visit(std::forward<Args>(args)...)) {
-        return visit(std::forward<Args>(args)...);
-    }
+/// Helper class which abstracts away variant's `visit` function. `std::variant`
+/// and similar `namespace::variant` types which provide a `namespace::visit()`
+/// function are handled here automatically using argument-dependent lookup.
+/// Users can provide specializations for other variant-like classes, e.g.
+/// `boost::variant` and `boost::apply_visitor`.
+template <template <typename...> class Variant> struct visit_helper {
+  template <typename... Args>
+  static auto call(Args &&... args)
+      -> decltype(visit(std::forward<Args>(args)...)) {
+    return visit(std::forward<Args>(args)...);
+  }
 };
 
 /// Generic variant caster
 template <typename Variant> struct variant_caster;
 
-template <template<typename...> class V, typename... Ts>
+template <template <typename...> class V, typename... Ts>
 struct variant_caster<V<Ts...>> {
-    static_assert(sizeof...(Ts) > 0, "Variant must consist of at least one alternative.");
+  static_assert(sizeof...(Ts) > 0,
+                "Variant must consist of at least one alternative.");
 
-    template <typename U, typename... Us>
-    bool load_alternative(handle src, bool convert, type_list<U, Us...>) {
-        auto caster = make_caster<U>();
-        if (caster.load(src, convert)) {
-            value = cast_op<U>(caster);
-            return true;
-        }
-        return load_alternative(src, convert, type_list<Us...>{});
+  template <typename U, typename... Us>
+  bool load_alternative(handle src, bool convert, type_list<U, Us...>) {
+    auto caster = make_caster<U>();
+    if (caster.load(src, convert)) {
+      value = cast_op<U>(caster);
+      return true;
     }
+    return load_alternative(src, convert, type_list<Us...>{});
+  }
 
-    bool load_alternative(handle, bool, type_list<>) { return false; }
+  bool load_alternative(handle, bool, type_list<>) { return false; }
 
-    bool load(handle src, bool convert) {
-        // Do a first pass without conversions to improve constructor resolution.
-        // E.g. `py::int_(1).cast<variant<double, int>>()` needs to fill the `int`
-        // slot of the variant. Without two-pass loading `double` would be filled
-        // because it appears first and a conversion is possible.
-        if (convert && load_alternative(src, false, type_list<Ts...>{}))
-            return true;
-        return load_alternative(src, convert, type_list<Ts...>{});
-    }
+  bool load(handle src, bool convert) {
+    // Do a first pass without conversions to improve constructor resolution.
+    // E.g. `py::int_(1).cast<variant<double, int>>()` needs to fill the `int`
+    // slot of the variant. Without two-pass loading `double` would be filled
+    // because it appears first and a conversion is possible.
+    if (convert && load_alternative(src, false, type_list<Ts...>{}))
+      return true;
+    return load_alternative(src, convert, type_list<Ts...>{});
+  }
 
-    template <typename Variant>
-    static handle cast(Variant &&src, return_value_policy policy, handle parent) {
-        return visit_helper<V>::call(variant_caster_visitor{policy, parent},
-                                     std::forward<Variant>(src));
-    }
+  template <typename Variant>
+  static handle cast(Variant &&src, return_value_policy policy, handle parent) {
+    return visit_helper<V>::call(variant_caster_visitor{policy, parent},
+                                 std::forward<Variant>(src));
+  }
 
-    using Type = V<Ts...>;
-    PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster<Ts>::name...) + _("]"));
+  using Type = V<Ts...>;
+  PYBIND11_TYPE_CASTER(Type, _("Union[") +
+                                 detail::concat(make_caster<Ts>::name...) +
+                                 _("]"));
 };
 
 #if PYBIND11_HAS_VARIANT
 template <typename... Ts>
-struct type_caster<std::variant<Ts...>> : variant_caster<std::variant<Ts...>> { };
+struct type_caster<std::variant<Ts...>> : variant_caster<std::variant<Ts...>> {
+};
 #endif
 
 NAMESPACE_END(detail)
 
 inline std::ostream &operator<<(std::ostream &os, const handle &obj) {
-    os << (std::string) str(obj);
-    return os;
+  os << (std::string)str(obj);
+  return os;
 }
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/stl_bind.h b/python/src/pybind11/stl_bind.h
index 1f8725260..b27c7c127 100644
--- a/python/src/pybind11/stl_bind.h
+++ b/python/src/pybind11/stl_bind.h
@@ -19,422 +19,457 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
 /* SFINAE helper class used by 'is_comparable */
-template <typename T>  struct container_traits {
-    template <typename T2> static std::true_type test_comparable(decltype(std::declval<const T2 &>() == std::declval<const T2 &>())*);
-    template <typename T2> static std::false_type test_comparable(...);
-    template <typename T2> static std::true_type test_value(typename T2::value_type *);
-    template <typename T2> static std::false_type test_value(...);
-    template <typename T2> static std::true_type test_pair(typename T2::first_type *, typename T2::second_type *);
-    template <typename T2> static std::false_type test_pair(...);
+template <typename T> struct container_traits {
+  template <typename T2>
+  static std::true_type test_comparable(decltype(std::declval<const T2 &>() ==
+                                                 std::declval<const T2 &>()) *);
+  template <typename T2> static std::false_type test_comparable(...);
+  template <typename T2>
+  static std::true_type test_value(typename T2::value_type *);
+  template <typename T2> static std::false_type test_value(...);
+  template <typename T2>
+  static std::true_type test_pair(typename T2::first_type *,
+                                  typename T2::second_type *);
+  template <typename T2> static std::false_type test_pair(...);
 
-    static constexpr const bool is_comparable = std::is_same<std::true_type, decltype(test_comparable<T>(nullptr))>::value;
-    static constexpr const bool is_pair = std::is_same<std::true_type, decltype(test_pair<T>(nullptr, nullptr))>::value;
-    static constexpr const bool is_vector = std::is_same<std::true_type, decltype(test_value<T>(nullptr))>::value;
-    static constexpr const bool is_element = !is_pair && !is_vector;
+  static constexpr const bool is_comparable =
+      std::is_same<std::true_type,
+                   decltype(test_comparable<T>(nullptr))>::value;
+  static constexpr const bool is_pair =
+      std::is_same<std::true_type,
+                   decltype(test_pair<T>(nullptr, nullptr))>::value;
+  static constexpr const bool is_vector =
+      std::is_same<std::true_type, decltype(test_value<T>(nullptr))>::value;
+  static constexpr const bool is_element = !is_pair && !is_vector;
 };
 
 /* Default: is_comparable -> std::false_type */
 template <typename T, typename SFINAE = void>
-struct is_comparable : std::false_type { };
+struct is_comparable : std::false_type {};
 
 /* For non-map data structures, check whether operator== can be instantiated */
 template <typename T>
-struct is_comparable<
-    T, enable_if_t<container_traits<T>::is_element &&
-                   container_traits<T>::is_comparable>>
-    : std::true_type { };
+struct is_comparable<T, enable_if_t<container_traits<T>::is_element &&
+                                    container_traits<T>::is_comparable>>
+    : std::true_type {};
 
-/* For a vector/map data structure, recursively check the value type (which is std::pair for maps) */
+/* For a vector/map data structure, recursively check the value type (which is
+ * std::pair for maps) */
 template <typename T>
 struct is_comparable<T, enable_if_t<container_traits<T>::is_vector>> {
-    static constexpr const bool value =
-        is_comparable<typename T::value_type>::value;
+  static constexpr const bool value =
+      is_comparable<typename T::value_type>::value;
 };
 
 /* For pairs, recursively check the two data types */
 template <typename T>
 struct is_comparable<T, enable_if_t<container_traits<T>::is_pair>> {
-    static constexpr const bool value =
-        is_comparable<typename T::first_type>::value &&
-        is_comparable<typename T::second_type>::value;
+  static constexpr const bool value =
+      is_comparable<typename T::first_type>::value &&
+      is_comparable<typename T::second_type>::value;
 };
 
 /* Fallback functions */
-template <typename, typename, typename... Args> void vector_if_copy_constructible(const Args &...) { }
-template <typename, typename, typename... Args> void vector_if_equal_operator(const Args &...) { }
-template <typename, typename, typename... Args> void vector_if_insertion_operator(const Args &...) { }
-template <typename, typename, typename... Args> void vector_modifiers(const Args &...) { }
+template <typename, typename, typename... Args>
+void vector_if_copy_constructible(const Args &...) {}
+template <typename, typename, typename... Args>
+void vector_if_equal_operator(const Args &...) {}
+template <typename, typename, typename... Args>
+void vector_if_insertion_operator(const Args &...) {}
+template <typename, typename, typename... Args>
+void vector_modifiers(const Args &...) {}
 
-template<typename Vector, typename Class_>
-void vector_if_copy_constructible(enable_if_t<is_copy_constructible<Vector>::value, Class_> &cl) {
-    cl.def(init<const Vector &>(), "Copy constructor");
+template <typename Vector, typename Class_>
+void vector_if_copy_constructible(
+    enable_if_t<is_copy_constructible<Vector>::value, Class_> &cl) {
+  cl.def(init<const Vector &>(), "Copy constructor");
 }
 
-template<typename Vector, typename Class_>
-void vector_if_equal_operator(enable_if_t<is_comparable<Vector>::value, Class_> &cl) {
-    using T = typename Vector::value_type;
+template <typename Vector, typename Class_>
+void vector_if_equal_operator(
+    enable_if_t<is_comparable<Vector>::value, Class_> &cl) {
+  using T = typename Vector::value_type;
 
-    cl.def(self == self);
-    cl.def(self != self);
+  cl.def(self == self);
+  cl.def(self != self);
 
-    cl.def("count",
-        [](const Vector &v, const T &x) {
-            return std::count(v.begin(), v.end(), x);
-        },
-        arg("x"),
-        "Return the number of times ``x`` appears in the list"
-    );
+  cl.def(
+      "count",
+      [](const Vector &v, const T &x) {
+        return std::count(v.begin(), v.end(), x);
+      },
+      arg("x"), "Return the number of times ``x`` appears in the list");
 
-    cl.def("remove", [](Vector &v, const T &x) {
-            auto p = std::find(v.begin(), v.end(), x);
-            if (p != v.end())
-                v.erase(p);
-            else
-                throw value_error();
-        },
-        arg("x"),
-        "Remove the first item from the list whose value is x. "
-        "It is an error if there is no such item."
-    );
+  cl.def(
+      "remove",
+      [](Vector &v, const T &x) {
+        auto p = std::find(v.begin(), v.end(), x);
+        if (p != v.end())
+          v.erase(p);
+        else
+          throw value_error();
+      },
+      arg("x"),
+      "Remove the first item from the list whose value is x. "
+      "It is an error if there is no such item.");
 
-    cl.def("__contains__",
-        [](const Vector &v, const T &x) {
-            return std::find(v.begin(), v.end(), x) != v.end();
-        },
-        arg("x"),
-        "Return true the container contains ``x``"
-    );
+  cl.def(
+      "__contains__",
+      [](const Vector &v, const T &x) {
+        return std::find(v.begin(), v.end(), x) != v.end();
+      },
+      arg("x"), "Return true the container contains ``x``");
 }
 
 // Vector modifiers -- requires a copyable vector_type:
-// (Technically, some of these (pop and __delitem__) don't actually require copyability, but it seems
-// silly to allow deletion but not insertion, so include them here too.)
+// (Technically, some of these (pop and __delitem__) don't actually require
+// copyability, but it seems silly to allow deletion but not insertion, so
+// include them here too.)
 template <typename Vector, typename Class_>
-void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_type>::value, Class_> &cl) {
-    using T = typename Vector::value_type;
-    using SizeType = typename Vector::size_type;
-    using DiffType = typename Vector::difference_type;
+void vector_modifiers(
+    enable_if_t<is_copy_constructible<typename Vector::value_type>::value,
+                Class_> &cl) {
+  using T = typename Vector::value_type;
+  using SizeType = typename Vector::size_type;
+  using DiffType = typename Vector::difference_type;
 
-    cl.def("append",
-           [](Vector &v, const T &value) { v.push_back(value); },
-           arg("x"),
-           "Add an item to the end of the list");
+  cl.def(
+      "append", [](Vector &v, const T &value) { v.push_back(value); }, arg("x"),
+      "Add an item to the end of the list");
 
-    cl.def(init([](iterable it) {
-        auto v = std::unique_ptr<Vector>(new Vector());
-        v->reserve(len_hint(it));
-        for (handle h : it)
-           v->push_back(h.cast<T>());
-        return v.release();
-    }));
+  cl.def(init([](iterable it) {
+    auto v = std::unique_ptr<Vector>(new Vector());
+    v->reserve(len_hint(it));
+    for (handle h : it)
+      v->push_back(h.cast<T>());
+    return v.release();
+  }));
 
-    cl.def("extend",
-       [](Vector &v, const Vector &src) {
-           v.insert(v.end(), src.begin(), src.end());
-       },
-       arg("L"),
-       "Extend the list by appending all the items in the given list"
-    );
+  cl.def(
+      "extend",
+      [](Vector &v, const Vector &src) {
+        v.insert(v.end(), src.begin(), src.end());
+      },
+      arg("L"), "Extend the list by appending all the items in the given list");
 
-    cl.def("extend",
-       [](Vector &v, iterable it) {
-           const size_t old_size = v.size();
-           v.reserve(old_size + len_hint(it));
-           try {
-               for (handle h : it) {
-                   v.push_back(h.cast<T>());
-               }
-           } catch (const cast_error &) {
-               v.erase(v.begin() + static_cast<typename Vector::difference_type>(old_size), v.end());
-               try {
-                   v.shrink_to_fit();
-               } catch (const std::exception &) {
-                   // Do nothing
-               }
-               throw;
-           }
-       },
-       arg("L"),
-       "Extend the list by appending all the items in the given list"
-    );
-
-    cl.def("insert",
-        [](Vector &v, SizeType i, const T &x) {
-            if (i > v.size())
-                throw index_error();
-            v.insert(v.begin() + (DiffType) i, x);
-        },
-        arg("i") , arg("x"),
-        "Insert an item at a given position."
-    );
-
-    cl.def("pop",
-        [](Vector &v) {
-            if (v.empty())
-                throw index_error();
-            T t = v.back();
-            v.pop_back();
-            return t;
-        },
-        "Remove and return the last item"
-    );
-
-    cl.def("pop",
-        [](Vector &v, SizeType i) {
-            if (i >= v.size())
-                throw index_error();
-            T t = v[i];
-            v.erase(v.begin() + (DiffType) i);
-            return t;
-        },
-        arg("i"),
-        "Remove and return the item at index ``i``"
-    );
-
-    cl.def("__setitem__",
-        [](Vector &v, SizeType i, const T &t) {
-            if (i >= v.size())
-                throw index_error();
-            v[i] = t;
+  cl.def(
+      "extend",
+      [](Vector &v, iterable it) {
+        const size_t old_size = v.size();
+        v.reserve(old_size + len_hint(it));
+        try {
+          for (handle h : it) {
+            v.push_back(h.cast<T>());
+          }
+        } catch (const cast_error &) {
+          v.erase(v.begin() +
+                      static_cast<typename Vector::difference_type>(old_size),
+                  v.end());
+          try {
+            v.shrink_to_fit();
+          } catch (const std::exception &) {
+            // Do nothing
+          }
+          throw;
         }
-    );
+      },
+      arg("L"), "Extend the list by appending all the items in the given list");
 
-    /// Slicing protocol
-    cl.def("__getitem__",
-        [](const Vector &v, slice slice) -> Vector * {
-            size_t start, stop, step, slicelength;
+  cl.def(
+      "insert",
+      [](Vector &v, SizeType i, const T &x) {
+        if (i > v.size())
+          throw index_error();
+        v.insert(v.begin() + (DiffType)i, x);
+      },
+      arg("i"), arg("x"), "Insert an item at a given position.");
 
-            if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
-                throw error_already_set();
+  cl.def(
+      "pop",
+      [](Vector &v) {
+        if (v.empty())
+          throw index_error();
+        T t = v.back();
+        v.pop_back();
+        return t;
+      },
+      "Remove and return the last item");
 
-            Vector *seq = new Vector();
-            seq->reserve((size_t) slicelength);
+  cl.def(
+      "pop",
+      [](Vector &v, SizeType i) {
+        if (i >= v.size())
+          throw index_error();
+        T t = v[i];
+        v.erase(v.begin() + (DiffType)i);
+        return t;
+      },
+      arg("i"), "Remove and return the item at index ``i``");
 
-            for (size_t i=0; i<slicelength; ++i) {
-                seq->push_back(v[start]);
-                start += step;
-            }
-            return seq;
-        },
-        arg("s"),
-        "Retrieve list elements using a slice object"
-    );
+  cl.def("__setitem__", [](Vector &v, SizeType i, const T &t) {
+    if (i >= v.size())
+      throw index_error();
+    v[i] = t;
+  });
 
-    cl.def("__setitem__",
-        [](Vector &v, slice slice,  const Vector &value) {
-            size_t start, stop, step, slicelength;
-            if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
-                throw error_already_set();
+  /// Slicing protocol
+  cl.def(
+      "__getitem__",
+      [](const Vector &v, slice slice) -> Vector * {
+        size_t start, stop, step, slicelength;
 
-            if (slicelength != value.size())
-                throw std::runtime_error("Left and right hand size of slice assignment have different sizes!");
+        if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
+          throw error_already_set();
 
-            for (size_t i=0; i<slicelength; ++i) {
-                v[start] = value[i];
-                start += step;
-            }
-        },
-        "Assign list elements using a slice object"
-    );
+        Vector *seq = new Vector();
+        seq->reserve((size_t)slicelength);
 
-    cl.def("__delitem__",
-        [](Vector &v, SizeType i) {
-            if (i >= v.size())
-                throw index_error();
-            v.erase(v.begin() + DiffType(i));
-        },
-        "Delete the list elements at index ``i``"
-    );
+        for (size_t i = 0; i < slicelength; ++i) {
+          seq->push_back(v[start]);
+          start += step;
+        }
+        return seq;
+      },
+      arg("s"), "Retrieve list elements using a slice object");
 
-    cl.def("__delitem__",
-        [](Vector &v, slice slice) {
-            size_t start, stop, step, slicelength;
+  cl.def(
+      "__setitem__",
+      [](Vector &v, slice slice, const Vector &value) {
+        size_t start, stop, step, slicelength;
+        if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
+          throw error_already_set();
 
-            if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
-                throw error_already_set();
+        if (slicelength != value.size())
+          throw std::runtime_error("Left and right hand size of slice "
+                                   "assignment have different sizes!");
 
-            if (step == 1 && false) {
-                v.erase(v.begin() + (DiffType) start, v.begin() + DiffType(start + slicelength));
-            } else {
-                for (size_t i = 0; i < slicelength; ++i) {
-                    v.erase(v.begin() + DiffType(start));
-                    start += step - 1;
-                }
-            }
-        },
-        "Delete list elements using a slice object"
-    );
+        for (size_t i = 0; i < slicelength; ++i) {
+          v[start] = value[i];
+          start += step;
+        }
+      },
+      "Assign list elements using a slice object");
 
+  cl.def(
+      "__delitem__",
+      [](Vector &v, SizeType i) {
+        if (i >= v.size())
+          throw index_error();
+        v.erase(v.begin() + DiffType(i));
+      },
+      "Delete the list elements at index ``i``");
+
+  cl.def(
+      "__delitem__",
+      [](Vector &v, slice slice) {
+        size_t start, stop, step, slicelength;
+
+        if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
+          throw error_already_set();
+
+        if (step == 1 && false) {
+          v.erase(v.begin() + (DiffType)start,
+                  v.begin() + DiffType(start + slicelength));
+        } else {
+          for (size_t i = 0; i < slicelength; ++i) {
+            v.erase(v.begin() + DiffType(start));
+            start += step - 1;
+          }
+        }
+      },
+      "Delete list elements using a slice object");
 }
 
-// If the type has an operator[] that doesn't return a reference (most notably std::vector<bool>),
-// we have to access by copying; otherwise we return by reference.
-template <typename Vector> using vector_needs_copy = negation<
-    std::is_same<decltype(std::declval<Vector>()[typename Vector::size_type()]), typename Vector::value_type &>>;
+// If the type has an operator[] that doesn't return a reference (most notably
+// std::vector<bool>), we have to access by copying; otherwise we return by
+// reference.
+template <typename Vector>
+using vector_needs_copy = negation<
+    std::is_same<decltype(std::declval<Vector>()[typename Vector::size_type()]),
+                 typename Vector::value_type &>>;
 
 // The usual case: access and iterate by reference
 template <typename Vector, typename Class_>
-void vector_accessor(enable_if_t<!vector_needs_copy<Vector>::value, Class_> &cl) {
-    using T = typename Vector::value_type;
-    using SizeType = typename Vector::size_type;
-    using ItType   = typename Vector::iterator;
+void vector_accessor(
+    enable_if_t<!vector_needs_copy<Vector>::value, Class_> &cl) {
+  using T = typename Vector::value_type;
+  using SizeType = typename Vector::size_type;
+  using ItType = typename Vector::iterator;
 
-    cl.def("__getitem__",
-        [](Vector &v, SizeType i) -> T & {
-            if (i >= v.size())
-                throw index_error();
-            return v[i];
-        },
-        return_value_policy::reference_internal // ref + keepalive
-    );
+  cl.def(
+      "__getitem__",
+      [](Vector &v, SizeType i) -> T & {
+        if (i >= v.size())
+          throw index_error();
+        return v[i];
+      },
+      return_value_policy::reference_internal // ref + keepalive
+  );
 
-    cl.def("__iter__",
-           [](Vector &v) {
-               return make_iterator<
-                   return_value_policy::reference_internal, ItType, ItType, T&>(
-                   v.begin(), v.end());
-           },
-           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
-    );
+  cl.def(
+      "__iter__",
+      [](Vector &v) {
+        return make_iterator<return_value_policy::reference_internal, ItType,
+                             ItType, T &>(v.begin(), v.end());
+      },
+      keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+  );
 }
 
-// The case for special objects, like std::vector<bool>, that have to be returned-by-copy:
+// The case for special objects, like std::vector<bool>, that have to be
+// returned-by-copy:
 template <typename Vector, typename Class_>
-void vector_accessor(enable_if_t<vector_needs_copy<Vector>::value, Class_> &cl) {
-    using T = typename Vector::value_type;
-    using SizeType = typename Vector::size_type;
-    using ItType   = typename Vector::iterator;
-    cl.def("__getitem__",
-        [](const Vector &v, SizeType i) -> T {
-            if (i >= v.size())
-                throw index_error();
-            return v[i];
+void vector_accessor(
+    enable_if_t<vector_needs_copy<Vector>::value, Class_> &cl) {
+  using T = typename Vector::value_type;
+  using SizeType = typename Vector::size_type;
+  using ItType = typename Vector::iterator;
+  cl.def("__getitem__", [](const Vector &v, SizeType i) -> T {
+    if (i >= v.size())
+      throw index_error();
+    return v[i];
+  });
+
+  cl.def(
+      "__iter__",
+      [](Vector &v) {
+        return make_iterator<return_value_policy::copy, ItType, ItType, T>(
+            v.begin(), v.end());
+      },
+      keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+  );
+}
+
+template <typename Vector, typename Class_>
+auto vector_if_insertion_operator(Class_ &cl, std::string const &name)
+    -> decltype(std::declval<std::ostream &>()
+                    << std::declval<typename Vector::value_type>(),
+                void()) {
+  using size_type = typename Vector::size_type;
+
+  cl.def(
+      "__repr__",
+      [name](Vector &v) {
+        std::ostringstream s;
+        s << name << '[';
+        for (size_type i = 0; i < v.size(); ++i) {
+          s << v[i];
+          if (i != v.size() - 1)
+            s << ", ";
         }
-    );
-
-    cl.def("__iter__",
-           [](Vector &v) {
-               return make_iterator<
-                   return_value_policy::copy, ItType, ItType, T>(
-                   v.begin(), v.end());
-           },
-           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
-    );
+        s << ']';
+        return s.str();
+      },
+      "Return the canonical string representation of this list.");
 }
 
-template <typename Vector, typename Class_> auto vector_if_insertion_operator(Class_ &cl, std::string const &name)
-    -> decltype(std::declval<std::ostream&>() << std::declval<typename Vector::value_type>(), void()) {
-    using size_type = typename Vector::size_type;
-
-    cl.def("__repr__",
-           [name](Vector &v) {
-            std::ostringstream s;
-            s << name << '[';
-            for (size_type i=0; i < v.size(); ++i) {
-                s << v[i];
-                if (i != v.size() - 1)
-                    s << ", ";
-            }
-            s << ']';
-            return s.str();
-        },
-        "Return the canonical string representation of this list."
-    );
-}
-
-// Provide the buffer interface for vectors if we have data() and we have a format for it
-// GCC seems to have "void std::vector<bool>::data()" - doing SFINAE on the existence of data() is insufficient, we need to check it returns an appropriate pointer
+// Provide the buffer interface for vectors if we have data() and we have a
+// format for it. GCC seems to have "void std::vector<bool>::data()" - doing
+// SFINAE on the existence of data() is insufficient, we need to check it
+// returns an appropriate pointer.
 template <typename Vector, typename = void>
 struct vector_has_data_and_format : std::false_type {};
 template <typename Vector>
-struct vector_has_data_and_format<Vector, enable_if_t<std::is_same<decltype(format_descriptor<typename Vector::value_type>::format(), std::declval<Vector>().data()), typename Vector::value_type*>::value>> : std::true_type {};
+struct vector_has_data_and_format<
+    Vector,
+    enable_if_t<std::is_same<
+        decltype(format_descriptor<typename Vector::value_type>::format(),
+                 std::declval<Vector>().data()),
+        typename Vector::value_type *>::value>> : std::true_type {};
 
 // Add the buffer interface to a vector
 template <typename Vector, typename Class_, typename... Args>
 enable_if_t<detail::any_of<std::is_same<Args, buffer_protocol>...>::value>
-vector_buffer(Class_& cl) {
-    using T = typename Vector::value_type;
+vector_buffer(Class_ &cl) {
+  using T = typename Vector::value_type;
 
-    static_assert(vector_has_data_and_format<Vector>::value, "There is not an appropriate format descriptor for this vector");
+  static_assert(
+      vector_has_data_and_format<Vector>::value,
+      "There is not an appropriate format descriptor for this vector");
 
-    // numpy.h declares this for arbitrary types, but it may raise an exception and crash hard at runtime if PYBIND11_NUMPY_DTYPE hasn't been called, so check here
-    format_descriptor<T>::format();
+  // numpy.h declares this for arbitrary types, but it may raise an exception
+  // and crash hard at runtime if PYBIND11_NUMPY_DTYPE hasn't been called, so
+  // check here
+  format_descriptor<T>::format();
 
-    cl.def_buffer([](Vector& v) -> buffer_info {
-        return buffer_info(v.data(), static_cast<ssize_t>(sizeof(T)), format_descriptor<T>::format(), 1, {v.size()}, {sizeof(T)});
-    });
+  cl.def_buffer([](Vector &v) -> buffer_info {
+    return buffer_info(v.data(), static_cast<ssize_t>(sizeof(T)),
+                       format_descriptor<T>::format(), 1, {v.size()},
+                       {sizeof(T)});
+  });
 
-    cl.def(init([](buffer buf) {
-        auto info = buf.request();
-        if (info.ndim != 1 || info.strides[0] % static_cast<ssize_t>(sizeof(T)))
-            throw type_error("Only valid 1D buffers can be copied to a vector");
-        if (!detail::compare_buffer_info<T>::compare(info) || (ssize_t) sizeof(T) != info.itemsize)
-            throw type_error("Format mismatch (Python: " + info.format + " C++: " + format_descriptor<T>::format() + ")");
+  cl.def(init([](buffer buf) {
+    auto info = buf.request();
+    if (info.ndim != 1 || info.strides[0] % static_cast<ssize_t>(sizeof(T)))
+      throw type_error("Only valid 1D buffers can be copied to a vector");
+    if (!detail::compare_buffer_info<T>::compare(info) ||
+        (ssize_t)sizeof(T) != info.itemsize)
+      throw type_error("Format mismatch (Python: " + info.format +
+                       " C++: " + format_descriptor<T>::format() + ")");
 
-        auto vec = std::unique_ptr<Vector>(new Vector());
-        vec->reserve((size_t) info.shape[0]);
-        T *p = static_cast<T*>(info.ptr);
-        ssize_t step = info.strides[0] / static_cast<ssize_t>(sizeof(T));
-        T *end = p + info.shape[0] * step;
-        for (; p != end; p += step)
-            vec->push_back(*p);
-        return vec.release();
-    }));
+    auto vec = std::unique_ptr<Vector>(new Vector());
+    vec->reserve((size_t)info.shape[0]);
+    T *p = static_cast<T *>(info.ptr);
+    ssize_t step = info.strides[0] / static_cast<ssize_t>(sizeof(T));
+    T *end = p + info.shape[0] * step;
+    for (; p != end; p += step)
+      vec->push_back(*p);
+    return vec.release();
+  }));
 
-    return;
+  return;
 }
 
 template <typename Vector, typename Class_, typename... Args>
-enable_if_t<!detail::any_of<std::is_same<Args, buffer_protocol>...>::value> vector_buffer(Class_&) {}
+enable_if_t<!detail::any_of<std::is_same<Args, buffer_protocol>...>::value>
+vector_buffer(Class_ &) {}
 
 NAMESPACE_END(detail)
 
 //
 // std::vector
 //
-template <typename Vector, typename holder_type = std::unique_ptr<Vector>, typename... Args>
-class_<Vector, holder_type> bind_vector(handle scope, std::string const &name, Args&&... args) {
-    using Class_ = class_<Vector, holder_type>;
+template <typename Vector, typename holder_type = std::unique_ptr<Vector>,
+          typename... Args>
+class_<Vector, holder_type> bind_vector(handle scope, std::string const &name,
+                                        Args &&... args) {
+  using Class_ = class_<Vector, holder_type>;
 
-    // If the value_type is unregistered (e.g. a converting type) or is itself registered
-    // module-local then make the vector binding module-local as well:
-    using vtype = typename Vector::value_type;
-    auto vtype_info = detail::get_type_info(typeid(vtype));
-    bool local = !vtype_info || vtype_info->module_local;
+  // If the value_type is unregistered (e.g. a converting type) or is itself
+  // registered module-local then make the vector binding module-local as well:
+  using vtype = typename Vector::value_type;
+  auto vtype_info = detail::get_type_info(typeid(vtype));
+  bool local = !vtype_info || vtype_info->module_local;
 
-    Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
+  Class_ cl(scope, name.c_str(), pybind11::module_local(local),
+            std::forward<Args>(args)...);
 
-    // Declare the buffer interface if a buffer_protocol() is passed in
-    detail::vector_buffer<Vector, Class_, Args...>(cl);
+  // Declare the buffer interface if a buffer_protocol() is passed in
+  detail::vector_buffer<Vector, Class_, Args...>(cl);
 
-    cl.def(init<>());
+  cl.def(init<>());
 
-    // Register copy constructor (if possible)
-    detail::vector_if_copy_constructible<Vector, Class_>(cl);
+  // Register copy constructor (if possible)
+  detail::vector_if_copy_constructible<Vector, Class_>(cl);
 
-    // Register comparison-related operators and functions (if possible)
-    detail::vector_if_equal_operator<Vector, Class_>(cl);
+  // Register comparison-related operators and functions (if possible)
+  detail::vector_if_equal_operator<Vector, Class_>(cl);
 
-    // Register stream insertion operator (if possible)
-    detail::vector_if_insertion_operator<Vector, Class_>(cl, name);
+  // Register stream insertion operator (if possible)
+  detail::vector_if_insertion_operator<Vector, Class_>(cl, name);
 
-    // Modifiers require copyable vector value type
-    detail::vector_modifiers<Vector, Class_>(cl);
-
-    // Accessor and iterator; return by value if copyable, otherwise we return by ref + keep-alive
-    detail::vector_accessor<Vector, Class_>(cl);
-
-    cl.def("__bool__",
-        [](const Vector &v) -> bool {
-            return !v.empty();
-        },
-        "Check whether the list is nonempty"
-    );
-
-    cl.def("__len__", &Vector::size);
+  // Modifiers require copyable vector value type
+  detail::vector_modifiers<Vector, Class_>(cl);
 
+  // Accessor and iterator; return by ref + keep-alive unless operator[] does
+  // not yield a reference (e.g. std::vector<bool>), then return by value
+  detail::vector_accessor<Vector, Class_>(cl);
 
+  cl.def(
+      "__bool__", [](const Vector &v) -> bool { return !v.empty(); },
+      "Check whether the list is nonempty");
 
+  cl.def("__len__", &Vector::size);
 
 #if 0
     // C++ style functions deprecated, leaving it here as an example
@@ -476,11 +511,9 @@ class_<Vector, holder_type> bind_vector(handle scope, std::string const &name, A
 
 #endif
 
-    return cl;
+  return cl;
 }
 
-
-
 //
 // std::map, std::unordered_map
 //
@@ -488,143 +521,149 @@ class_<Vector, holder_type> bind_vector(handle scope, std::string const &name, A
 NAMESPACE_BEGIN(detail)
 
 /* Fallback functions */
-template <typename, typename, typename... Args> void map_if_insertion_operator(const Args &...) { }
-template <typename, typename, typename... Args> void map_assignment(const Args &...) { }
+template <typename, typename, typename... Args>
+void map_if_insertion_operator(const Args &...) {}
+template <typename, typename, typename... Args>
+void map_assignment(const Args &...) {}
 
 // Map assignment when copy-assignable: just copy the value
 template <typename Map, typename Class_>
-void map_assignment(enable_if_t<std::is_copy_assignable<typename Map::mapped_type>::value, Class_> &cl) {
-    using KeyType = typename Map::key_type;
-    using MappedType = typename Map::mapped_type;
+void map_assignment(
+    enable_if_t<std::is_copy_assignable<typename Map::mapped_type>::value,
+                Class_> &cl) {
+  using KeyType = typename Map::key_type;
+  using MappedType = typename Map::mapped_type;
 
-    cl.def("__setitem__",
-           [](Map &m, const KeyType &k, const MappedType &v) {
-               auto it = m.find(k);
-               if (it != m.end()) it->second = v;
-               else m.emplace(k, v);
-           }
-    );
+  cl.def("__setitem__", [](Map &m, const KeyType &k, const MappedType &v) {
+    auto it = m.find(k);
+    if (it != m.end())
+      it->second = v;
+    else
+      m.emplace(k, v);
+  });
 }
 
-// Not copy-assignable, but still copy-constructible: we can update the value by erasing and reinserting
-template<typename Map, typename Class_>
-void map_assignment(enable_if_t<
-        !std::is_copy_assignable<typename Map::mapped_type>::value &&
-        is_copy_constructible<typename Map::mapped_type>::value,
-        Class_> &cl) {
-    using KeyType = typename Map::key_type;
-    using MappedType = typename Map::mapped_type;
+// Not copy-assignable, but still copy-constructible: we can update the value by
+// erasing and reinserting
+template <typename Map, typename Class_>
+void map_assignment(
+    enable_if_t<!std::is_copy_assignable<typename Map::mapped_type>::value &&
+                    is_copy_constructible<typename Map::mapped_type>::value,
+                Class_> &cl) {
+  using KeyType = typename Map::key_type;
+  using MappedType = typename Map::mapped_type;
 
-    cl.def("__setitem__",
-           [](Map &m, const KeyType &k, const MappedType &v) {
-               // We can't use m[k] = v; because value type might not be default constructable
-               auto r = m.emplace(k, v);
-               if (!r.second) {
-                   // value type is not copy assignable so the only way to insert it is to erase it first...
-                   m.erase(r.first);
-                   m.emplace(k, v);
-               }
-           }
-    );
+  cl.def("__setitem__", [](Map &m, const KeyType &k, const MappedType &v) {
+    // We can't use m[k] = v; because value type might not be default
+    // constructible
+    auto r = m.emplace(k, v);
+    if (!r.second) {
+      // value type is not copy assignable so the only way to insert it is to
+      // erase it first...
+      m.erase(r.first);
+      m.emplace(k, v);
+    }
+  });
 }
 
+template <typename Map, typename Class_>
+auto map_if_insertion_operator(Class_ &cl, std::string const &name)
+    -> decltype(std::declval<std::ostream &>()
+                    << std::declval<typename Map::key_type>()
+                    << std::declval<typename Map::mapped_type>(),
+                void()) {
 
-template <typename Map, typename Class_> auto map_if_insertion_operator(Class_ &cl, std::string const &name)
--> decltype(std::declval<std::ostream&>() << std::declval<typename Map::key_type>() << std::declval<typename Map::mapped_type>(), void()) {
-
-    cl.def("__repr__",
-           [name](Map &m) {
-            std::ostringstream s;
-            s << name << '{';
-            bool f = false;
-            for (auto const &kv : m) {
-                if (f)
-                    s << ", ";
-                s << kv.first << ": " << kv.second;
-                f = true;
-            }
-            s << '}';
-            return s.str();
-        },
-        "Return the canonical string representation of this map."
-    );
+  cl.def(
+      "__repr__",
+      [name](Map &m) {
+        std::ostringstream s;
+        s << name << '{';
+        bool f = false;
+        for (auto const &kv : m) {
+          if (f)
+            s << ", ";
+          s << kv.first << ": " << kv.second;
+          f = true;
+        }
+        s << '}';
+        return s.str();
+      },
+      "Return the canonical string representation of this map.");
 }
 
-
 NAMESPACE_END(detail)
 
-template <typename Map, typename holder_type = std::unique_ptr<Map>, typename... Args>
-class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&... args) {
-    using KeyType = typename Map::key_type;
-    using MappedType = typename Map::mapped_type;
-    using Class_ = class_<Map, holder_type>;
+template <typename Map, typename holder_type = std::unique_ptr<Map>,
+          typename... Args>
+class_<Map, holder_type> bind_map(handle scope, const std::string &name,
+                                  Args &&... args) {
+  using KeyType = typename Map::key_type;
+  using MappedType = typename Map::mapped_type;
+  using Class_ = class_<Map, holder_type>;
 
-    // If either type is a non-module-local bound type then make the map binding non-local as well;
-    // otherwise (e.g. both types are either module-local or converting) the map will be
-    // module-local.
-    auto tinfo = detail::get_type_info(typeid(MappedType));
-    bool local = !tinfo || tinfo->module_local;
-    if (local) {
-        tinfo = detail::get_type_info(typeid(KeyType));
-        local = !tinfo || tinfo->module_local;
-    }
+  // If either type is a non-module-local bound type then make the map binding
+  // non-local as well; otherwise (e.g. both types are either module-local or
+  // converting) the map will be module-local.
+  auto tinfo = detail::get_type_info(typeid(MappedType));
+  bool local = !tinfo || tinfo->module_local;
+  if (local) {
+    tinfo = detail::get_type_info(typeid(KeyType));
+    local = !tinfo || tinfo->module_local;
+  }
 
-    Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
+  Class_ cl(scope, name.c_str(), pybind11::module_local(local),
+            std::forward<Args>(args)...);
 
-    cl.def(init<>());
+  cl.def(init<>());
 
-    // Register stream insertion operator (if possible)
-    detail::map_if_insertion_operator<Map, Class_>(cl, name);
+  // Register stream insertion operator (if possible)
+  detail::map_if_insertion_operator<Map, Class_>(cl, name);
 
-    cl.def("__bool__",
-        [](const Map &m) -> bool { return !m.empty(); },
-        "Check whether the map is nonempty"
-    );
+  cl.def(
+      "__bool__", [](const Map &m) -> bool { return !m.empty(); },
+      "Check whether the map is nonempty");
 
-    cl.def("__iter__",
-           [](Map &m) { return make_key_iterator(m.begin(), m.end()); },
-           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
-    );
+  cl.def(
+      "__iter__", [](Map &m) { return make_key_iterator(m.begin(), m.end()); },
+      keep_alive<0, 1>() /* Essential: keep map alive while iterator exists */
+  );
 
-    cl.def("items",
-           [](Map &m) { return make_iterator(m.begin(), m.end()); },
-           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
-    );
+  cl.def(
+      "items", [](Map &m) { return make_iterator(m.begin(), m.end()); },
+      keep_alive<0, 1>() /* Essential: keep map alive while iterator exists */
+  );
 
-    cl.def("__getitem__",
-        [](Map &m, const KeyType &k) -> MappedType & {
-            auto it = m.find(k);
-            if (it == m.end())
-              throw key_error();
-           return it->second;
-        },
-        return_value_policy::reference_internal // ref + keepalive
-    );
+  cl.def(
+      "__getitem__",
+      [](Map &m, const KeyType &k) -> MappedType & {
+        auto it = m.find(k);
+        if (it == m.end())
+          throw key_error();
+        return it->second;
+      },
+      return_value_policy::reference_internal // ref + keepalive
+  );
 
-    cl.def("__contains__",
-        [](Map &m, const KeyType &k) -> bool {
-            auto it = m.find(k);
-            if (it == m.end())
-              return false;
-           return true;
-        }
-    );
+  cl.def("__contains__", [](Map &m, const KeyType &k) -> bool {
+    auto it = m.find(k);
+    if (it == m.end())
+      return false;
+    return true;
+  });
 
-    // Assignment provided only if the type is copyable
-    detail::map_assignment<Map, Class_>(cl);
+  // Assignment provided only if the type is copyable
+  detail::map_assignment<Map, Class_>(cl);
 
-    cl.def("__delitem__",
-           [](Map &m, const KeyType &k) {
-               auto it = m.find(k);
-               if (it == m.end())
-                   throw key_error();
-               m.erase(it);
-           }
-    );
+  cl.def("__delitem__", [](Map &m, const KeyType &k) {
+    auto it = m.find(k);
+    if (it == m.end())
+      throw key_error();
+    m.erase(it);
+  });
 
-    cl.def("__len__", &Map::size);
+  cl.def("__len__", &Map::size);
 
-    return cl;
+  return cl;
 }
 
 NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/triton.cc b/python/src/triton.cc
index d26c4faf6..b2ecbdd6b 100644
--- a/python/src/triton.cc
+++ b/python/src/triton.cc
@@ -11,26 +11,25 @@
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
 
-
 #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/Triton/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 
-#include "llvm/IR/Module.h"
 #include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 
 #include "llvm/Support/raw_ostream.h"
 
+#include "Python.h"
 #include <optional>
 #include <pybind11/buffer_info.h>
 #include <pybind11/functional.h>
 #include <pybind11/pybind11.h>
-#include <pybind11/stl_bind.h>
 #include <pybind11/stl.h>
-#include "Python.h"
+#include <pybind11/stl_bind.h>
 #include <regex>
 #include <sstream>
 #include <stdexcept>
@@ -40,20 +39,17 @@ namespace py = pybind11;
 // namespace ir = triton::ir;
 namespace drv = triton::driver;
 
-
 /*****************************************************************************/
 /* Python bindings for triton::driver                                        */
 /*****************************************************************************/
 // information query
-template<CUdevice_attribute attr>
-int cuGetInfo(CUdevice device) {
+template <CUdevice_attribute attr> int cuGetInfo(CUdevice device) {
   int res;
   drv::dispatch::cuDeviceGetAttribute(&res, attr, device);
   return res;
 }
 
-template<hipDeviceAttribute_t attr>
-int hipGetInfo(hipDevice_t device) {
+template <hipDeviceAttribute_t attr> int hipGetInfo(hipDevice_t device) {
   int res;
   drv::dispatch::hipDeviceGetAttribute(&res, attr, device);
   return res;
@@ -65,69 +61,71 @@ enum backend_t {
   ROCM,
 };
 
-void cu_enable_peer_access(uint64_t peer_ptr){
+void cu_enable_peer_access(uint64_t peer_ptr) {
   CUcontext context;
-  drv::dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_ptr);
+  drv::dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT,
+                                       peer_ptr);
   try {
-      drv::dispatch::cuCtxEnablePeerAccess(context, 0);
-  } catch (drv::exception::cuda::peer_access_already_enabled) {}
+    drv::dispatch::cuCtxEnablePeerAccess(context, 0);
+  } catch (drv::exception::cuda::peer_access_already_enabled) {
+  }
 }
 
-void host_enqueue(uint64_t stream, uint64_t kernel,
-                  uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
-                  uint64_t block_0, uint64_t block_1, uint64_t block_2,
-                  void* args_ptr, size_t args_size, int64_t shared_mem){
+void host_enqueue(uint64_t stream, uint64_t kernel, uint64_t grid_0,
+                  uint64_t grid_1, uint64_t grid_2, uint64_t block_0,
+                  uint64_t block_1, uint64_t block_2, void *args_ptr,
+                  size_t args_size, int64_t shared_mem) {
   throw std::runtime_error("unsupported");
-// auto hst = kernel->module()->hst();
-// hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
-// char* params = new char[args_size];
-// std::memcpy((void*)params, (void*)args, args_size);
-// for(size_t i = 0; i < grid[0]; i++)
-//   for(size_t j = 0; j < grid[1]; j++)
-//     for(size_t k = 0; k < grid[2]; k++)
-//       hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, (char**)params, int32_t(i), int32_t(j), int32_t(k)));
+  // auto hst = kernel->module()->hst();
+  // hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
+  // char* params = new char[args_size];
+  // std::memcpy((void*)params, (void*)args, args_size);
+  // for(size_t i = 0; i < grid[0]; i++)
+  //   for(size_t j = 0; j < grid[1]; j++)
+  //     for(size_t k = 0; k < grid[2]; k++)
+  //       hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn,
+  //       (char**)params, int32_t(i), int32_t(j), int32_t(k)));
 }
 
-void cu_enqueue(uint64_t stream, uint64_t kernel,
-                uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
-                uint64_t block_0, uint64_t block_1, uint64_t block_2,
-                void* args_ptr, size_t args_size, int64_t shared_mem){
-  void *config[] = {
-      CU_LAUNCH_PARAM_BUFFER_POINTER, (void*)args_ptr,
-      CU_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
-      CU_LAUNCH_PARAM_END
-  };
-  drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2, 
-                                block_0, block_1, block_2, 
-                                shared_mem, (CUstream)stream, nullptr, config);
+void cu_enqueue(uint64_t stream, uint64_t kernel, uint64_t grid_0,
+                uint64_t grid_1, uint64_t grid_2, uint64_t block_0,
+                uint64_t block_1, uint64_t block_2, void *args_ptr,
+                size_t args_size, int64_t shared_mem) {
+  void *config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, (void *)args_ptr,
+                    CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
+                    CU_LAUNCH_PARAM_END};
+  drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2,
+                                block_0, block_1, block_2, shared_mem,
+                                (CUstream)stream, nullptr, config);
 }
 
-void hip_enqueue(uint64_t stream, uint64_t kernel,
-                uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
-                uint64_t block_0, uint64_t block_1, uint64_t block_2,
-                void* args_ptr, size_t args_size, int64_t shared_mem) {
-  void *config[] = {
-      HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)args_ptr,
-      HIP_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
-      HIP_LAUNCH_PARAM_END
-  };
-  drv::dispatch::hipModuleLaunchKernel((hipFunction_t)kernel, grid_0, grid_1, grid_2, 
-                                block_0, block_1, block_2, 
-                                shared_mem, (hipStream_t)stream, nullptr, config);
-
+void hip_enqueue(uint64_t stream, uint64_t kernel, uint64_t grid_0,
+                 uint64_t grid_1, uint64_t grid_2, uint64_t block_0,
+                 uint64_t block_1, uint64_t block_2, void *args_ptr,
+                 size_t args_size, int64_t shared_mem) {
+  void *config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, (void *)args_ptr,
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
+                    HIP_LAUNCH_PARAM_END};
+  drv::dispatch::hipModuleLaunchKernel(
+      (hipFunction_t)kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2,
+      shared_mem, (hipStream_t)stream, nullptr, config);
 }
 
-long pow2_divisor(long N){
-    if(N % 16 == 0) return 16;
-    if(N % 8 == 0) return 8;
-    if(N % 4 == 0) return 4;
-    if(N % 2 == 0) return 2;
-    return 1;
+long pow2_divisor(long N) {
+  if (N % 16 == 0)
+    return 16;
+  if (N % 8 == 0)
+    return 8;
+  if (N % 4 == 0)
+    return 4;
+  if (N % 2 == 0)
+    return 2;
+  return 1;
 }
 
 // Returns something like "int16", whether dtype is a torch.dtype or
 // triton.language.dtype.
-std::string dtype_cache_key_part(const py::object& dtype) {
+std::string dtype_cache_key_part(const py::object &dtype) {
   if (py::hasattr(dtype, "cache_key_part")) {
     // Presumed to be a triton.language.dtype.
     return std::string(py::str(py::getattr(dtype, "cache_key_part")));
@@ -135,140 +133,150 @@ std::string dtype_cache_key_part(const py::object& dtype) {
     // Remove 'torch.' prefix from repr of torch.dtype.
     py::object repr = py::repr(dtype);
     size_t repr_len = PyUnicode_GET_LENGTH(repr.ptr());
-    const char* repr_ptr = (const char*)PyUnicode_1BYTE_DATA(repr.ptr());
+    const char *repr_ptr = (const char *)PyUnicode_1BYTE_DATA(repr.ptr());
     if (repr_len <= 6 || strncmp(repr_ptr, "torch.", 6)) {
-      throw std::logic_error("invalid dtype: " + std::string(repr_ptr, repr_len));
+      throw std::logic_error("invalid dtype: " +
+                             std::string(repr_ptr, repr_len));
     }
     return std::string(repr_ptr + 6, repr_len - 6);
   }
 }
 
-size_t get_pointer_range_size(uint64_t addr){
-  if(addr == 0)
+size_t get_pointer_range_size(uint64_t addr) {
+  if (addr == 0)
     return 0;
   size_t size;
-  drv::dispatch::cuPointerGetAttribute(&size, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)addr);
+  drv::dispatch::cuPointerGetAttribute(&size, CU_POINTER_ATTRIBUTE_RANGE_SIZE,
+                                       (CUdeviceptr)addr);
   return size;
 }
 
 // Launch
-void parse_args(py::list& args, py::list do_not_specialize, const std::string& func_key, py::list& arg_names,
-                std::string& cache_key, std::string& params, size_t& params_size, py::dict constants,
-                int num_warps, int num_stages) {
-    size_t len = PyList_Size(args.ptr());
-    params.reserve(8*len); // 8 max bytes by argument
-    char* params_ptr = &params[0];
-    cache_key = func_key;
-    cache_key += "-" + std::to_string(num_warps);
-    cache_key += "-" + std::to_string(num_stages);
-    cache_key += "-";
-    for(int i = 0; i < len; i++){
-      cache_key += "_";
-      py::int_ py_i = py::int_(i);
-      bool specialize = !do_not_specialize.contains(py_i);
-      py::object arg = args[i];
-      auto arg_ptr = arg.ptr();
+void parse_args(py::list &args, py::list do_not_specialize,
+                const std::string &func_key, py::list &arg_names,
+                std::string &cache_key, std::string &params,
+                size_t &params_size, py::dict constants, int num_warps,
+                int num_stages) {
+  size_t len = PyList_Size(args.ptr());
+  params.reserve(8 * len); // at most 8 bytes per argument
+  char *params_ptr = &params[0];
+  cache_key = func_key;
+  cache_key += "-" + std::to_string(num_warps);
+  cache_key += "-" + std::to_string(num_stages);
+  cache_key += "-";
+  for (int i = 0; i < len; i++) {
+    cache_key += "_";
+    py::int_ py_i = py::int_(i);
+    bool specialize = !do_not_specialize.contains(py_i);
+    py::object arg = args[i];
+    auto arg_ptr = arg.ptr();
 
-      // argument is `long`
-      if(PyLong_Check(arg_ptr)){
-        int overflow;
-        long long value = PyLong_AsLongLongAndOverflow(arg_ptr, &overflow);
-        // values equal to 1 are specialized
-        if(specialize && (value == 1)){
-          cache_key += "1";
-          continue;
-        }
-        // int32, uint32, int64, and uint64 have different kernels
-        if (!overflow && -0x8000'0000LL <= value && value <= 0x7FFF'FFFFLL) {
-          cache_key += "int32";
-          params_ptr = (char*)(((uintptr_t)params_ptr + 3) & (-4));
-          std::memcpy(params_ptr, &value, 4);
-          params_ptr += 4;
-        } else if (!overflow && 0x8000'0000LL <= value && value <= 0xFFFF'FFFFLL) {
-          cache_key += "uint32";
-          params_ptr = (char*)(((uintptr_t)params_ptr + 3) & (-4));
-          std::memcpy(params_ptr, &value, 4);
-          params_ptr += 4;
-        } else if (!overflow) {
-          cache_key += "int64";
-          params_ptr = (char*)(((uintptr_t)params_ptr + 7) & (-8));
-          std::memcpy(params_ptr, &value, 8);
-          params_ptr += 8;
-        } else {
-          if (PyErr_Occurred()) {
-            throw std::logic_error("An error occurred?");
-          }
-          unsigned long long unsigned_value = PyLong_AsUnsignedLongLong(arg_ptr);
-          if (PyErr_Occurred()) {
-            throw std::runtime_error("integer overflow in argument: " + std::string(py::str(arg)));
-          }
-          cache_key += "uint64";
-          params_ptr = (char*)(((uintptr_t)params_ptr + 7) & (-8));
-          std::memcpy(params_ptr, &unsigned_value, 8);
-          params_ptr += 8;
-        }
-        if(!specialize)
-          continue;
-        // values divisible by small powers of 2 are specialized
-        cache_key += "[multipleof(";
-        cache_key += std::to_string(pow2_divisor(value));
-        cache_key += ")]";
+    // argument is `long`
+    if (PyLong_Check(arg_ptr)) {
+      int overflow;
+      long long value = PyLong_AsLongLongAndOverflow(arg_ptr, &overflow);
+      // values equal to 1 are specialized
+      if (specialize && (value == 1)) {
+        cache_key += "1";
         continue;
       }
-      // argument is `float`
-      if(PyFloat_Check(arg_ptr)){
-        cache_key += "float32";
-        float value = PyFloat_AsDouble(arg_ptr);
-        params_ptr = (char*)(((uintptr_t)params_ptr + 3) & (-4));
+      // int32, uint32, int64, and uint64 have different kernels
+      if (!overflow && -0x8000'0000LL <= value && value <= 0x7FFF'FFFFLL) {
+        cache_key += "int32";
+        params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4));
         std::memcpy(params_ptr, &value, 4);
         params_ptr += 4;
-        continue;
-      }
-      // argument is `bool`
-      if(PyBool_Check(arg_ptr)){
-        cache_key += "bool";
-        bool value =  arg_ptr == Py_True ? true : false;
-        std::memcpy(params_ptr, &value, 1);
-        params_ptr += 1;
-        continue;
-      }
-      // argument is tensor
-      if(py::hasattr(arg, "data_ptr")){
-        py::object data_ptr = arg.attr("data_ptr")();
-        long value = data_ptr.cast<long>();
-        params_ptr = (char*)(((uintptr_t)params_ptr + 7) & (-8));
-        // copy param
+      } else if (!overflow && 0x8000'0000LL <= value &&
+                 value <= 0xFFFF'FFFFLL) {
+        cache_key += "uint32";
+        params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4));
+        std::memcpy(params_ptr, &value, 4);
+        params_ptr += 4;
+      } else if (!overflow) {
+        cache_key += "int64";
+        params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8));
         std::memcpy(params_ptr, &value, 8);
         params_ptr += 8;
-        // udpate cache key
-        cache_key += dtype_cache_key_part(arg.attr("dtype"));
-        cache_key += "*";
-        cache_key += "[multipleof(";
-        size_t range_size = get_pointer_range_size(value);
-        cache_key += std::to_string(std::min(pow2_divisor(value), pow2_divisor(range_size)));
-        cache_key += ")]";
-        continue;
+      } else {
+        if (PyErr_Occurred()) {
+          throw std::logic_error("An error occurred?");
+        }
+        unsigned long long unsigned_value = PyLong_AsUnsignedLongLong(arg_ptr);
+        if (PyErr_Occurred()) {
+          throw std::runtime_error("integer overflow in argument: " +
+                                   std::string(py::str(arg)));
+        }
+        cache_key += "uint64";
+        params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8));
+        std::memcpy(params_ptr, &unsigned_value, 8);
+        params_ptr += 8;
       }
-      // argument is `constexpr`
-      if(py::hasattr(arg, "value")){
-        py::object value = arg.attr("value");
-        py::object name = arg_names[i];
-        constants[name] = value;
-        py::object repr = py::repr(value);
-        const char* start = (const char*)PyUnicode_1BYTE_DATA(repr.ptr());
-        size_t len = PyUnicode_GET_LENGTH(repr.ptr());
-        cache_key += std::string(start, len);
+      if (!specialize)
         continue;
-      }
-      std::string ty_str = arg.attr("__class__").attr("__name__").cast<std::string>();
-      if(ty_str == "NoneType"){
-        cache_key += "None";
-        continue;
-      }
-      std::string err_msg = "Received type '" + ty_str + "' for argument " + std::to_string(i) + "."
-                            + " Only int, float, bool, torch.Tensor, and triton.language.constexpr are supported.";
-      throw std::runtime_error(err_msg);
+      // values divisible by small powers of 2 are specialized
+      cache_key += "[multipleof(";
+      cache_key += std::to_string(pow2_divisor(value));
+      cache_key += ")]";
+      continue;
     }
+    // argument is `float`
+    if (PyFloat_Check(arg_ptr)) {
+      cache_key += "float32";
+      float value = PyFloat_AsDouble(arg_ptr);
+      params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4));
+      std::memcpy(params_ptr, &value, 4);
+      params_ptr += 4;
+      continue;
+    }
+    // argument is `bool`
+    if (PyBool_Check(arg_ptr)) {
+      cache_key += "bool";
+      bool value = arg_ptr == Py_True ? true : false;
+      std::memcpy(params_ptr, &value, 1);
+      params_ptr += 1;
+      continue;
+    }
+    // argument is tensor
+    if (py::hasattr(arg, "data_ptr")) {
+      py::object data_ptr = arg.attr("data_ptr")();
+      long value = data_ptr.cast<long>();
+      params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8));
+      // copy param
+      std::memcpy(params_ptr, &value, 8);
+      params_ptr += 8;
+      // update cache key
+      cache_key += dtype_cache_key_part(arg.attr("dtype"));
+      cache_key += "*";
+      cache_key += "[multipleof(";
+      size_t range_size = get_pointer_range_size(value);
+      cache_key += std::to_string(
+          std::min(pow2_divisor(value), pow2_divisor(range_size)));
+      cache_key += ")]";
+      continue;
+    }
+    // argument is `constexpr`
+    if (py::hasattr(arg, "value")) {
+      py::object value = arg.attr("value");
+      py::object name = arg_names[i];
+      constants[name] = value;
+      py::object repr = py::repr(value);
+      const char *start = (const char *)PyUnicode_1BYTE_DATA(repr.ptr());
+      size_t len = PyUnicode_GET_LENGTH(repr.ptr());
+      cache_key += std::string(start, len);
+      continue;
+    }
+    std::string ty_str =
+        arg.attr("__class__").attr("__name__").cast<std::string>();
+    if (ty_str == "NoneType") {
+      cache_key += "None";
+      continue;
+    }
+    std::string err_msg = "Received type '" + ty_str + "' for argument " +
+                          std::to_string(i) + "." +
+                          " Only int, float, bool, torch.Tensor, and "
+                          "triton.language.constexpr are supported.";
+    throw std::runtime_error(err_msg);
+  }
   params_size = (std::ptrdiff_t)(params_ptr - &params[0]);
 }
 
@@ -282,40 +290,42 @@ void init_triton_runtime(py::module &&m) {
 
   // wrap backend_t
   py::enum_<backend_t>(m, "backend")
-    .value("HOST", HOST)
-    .value("CUDA", CUDA)
-    .value("ROCM", ROCM)
-    .export_values();
+      .value("HOST", HOST)
+      .value("CUDA", CUDA)
+      .value("ROCM", ROCM)
+      .export_values();
 
   // enable peer-to-peer
   m.def("enable_peer_access", [](backend_t backend, uint64_t peer_ptr) {
-      if (backend != CUDA)
-        throw std::runtime_error("P2P only supported on CUDA devices!");
-      cu_enable_peer_access(peer_ptr);
-    }
-  );
+    if (backend != CUDA)
+      throw std::runtime_error("P2P only supported on CUDA devices!");
+    cu_enable_peer_access(peer_ptr);
+  });
 
   // get range size for the given pointer
   m.def("get_pointer_range_size", &get_pointer_range_size);
 
-
   // cache key
-  m.def("launch", [](py::list args, py::list do_not_specialize, const std::string& func_key, py::list& arg_names, 
-                     py::object device, py::int_ stream, py::dict bin_cache, py::int_ num_warps, py::int_ num_stages, 
-                     py::function add_to_cache, py::object grid){
-    // parse arguments to compute cache key, compile-time constants and packed kernel arguments
+  m.def("launch", [](py::list args, py::list do_not_specialize,
+                     const std::string &func_key, py::list &arg_names,
+                     py::object device, py::int_ stream, py::dict bin_cache,
+                     py::int_ num_warps, py::int_ num_stages,
+                     py::function add_to_cache, py::object grid) {
+    // parse arguments to compute cache key, compile-time constants and packed
+    // kernel arguments
     long _num_warps = PyLong_AsLong(num_warps.ptr());
     long _num_stages = PyLong_AsLong(num_stages.ptr());
     std::string cache_key;
     std::string params;
     size_t params_size;
     py::dict constants;
-    parse_args(args, do_not_specialize, func_key, arg_names, cache_key, params, params_size, constants, _num_warps, _num_stages);
+    parse_args(args, do_not_specialize, func_key, arg_names, cache_key, params,
+               params_size, constants, _num_warps, _num_stages);
 
     // get cached binary
     py::str key(cache_key);
     py::bool_ noop = false;
-    if(!bin_cache.contains(key)) {
+    if (!bin_cache.contains(key)) {
       noop = add_to_cache(key, args, device, num_warps, num_stages);
     }
     if (noop)
@@ -324,7 +334,7 @@ void init_triton_runtime(py::module &&m) {
 
     // get grid
     py::sequence seq;
-    if(!PySequence_Check(grid.ptr()))
+    if (!PySequence_Check(grid.ptr()))
       seq = grid(constants);
     else
       seq = grid;
@@ -338,20 +348,18 @@ void init_triton_runtime(py::module &&m) {
     uint64_t shared_mem = py::cast<uint64_t>(bin.attr("shared_mem"));
 
     // actually launch
-    void *config[] = {
-        CU_LAUNCH_PARAM_BUFFER_POINTER, params.data(),
-        CU_LAUNCH_PARAM_BUFFER_SIZE, &params_size,
-        CU_LAUNCH_PARAM_END
-    };
+    void *config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, params.data(),
+                      CU_LAUNCH_PARAM_BUFFER_SIZE, &params_size,
+                      CU_LAUNCH_PARAM_END};
     uint64_t _stream = PyLong_AsLong(stream.ptr());
-    if(grid_0*grid_1*grid_2 > 0) {
+    if (grid_0 * grid_1 * grid_2 > 0) {
       // release the gil in case the enqueue blocks
       // cuda will block if too many ops are enqueued
       py::gil_scoped_release allow_threads;
-      drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2, 
-                                    _num_warps*32, 1, 1, shared_mem, (CUstream)_stream, 
-                                     nullptr, config);
-   }
+      drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2,
+                                    _num_warps * 32, 1, 1, shared_mem,
+                                    (CUstream)_stream, nullptr, config);
+    }
     return bin;
   });
 
@@ -360,66 +368,73 @@ void init_triton_runtime(py::module &&m) {
       CUdevice dev = (CUdevice)device;
       int major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
       int minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
-      return major*10 + minor;
+      return major * 10 + minor;
     }
     return -1;
   });
 
   // query maximum shared memory
   m.def("max_shared_memory", [](backend_t backend, uint64_t device) {
-      if (backend == HOST)
-        return 0;
-      if(backend == CUDA) 
-        return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>(device);
-      if(backend == ROCM)
-        return hipGetInfo<hipDeviceAttributeMaxSharedMemoryPerBlock>(device);
-      return -1;
+    if (backend == HOST)
+      return 0;
+    if (backend == CUDA)
+      return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>(
+          device);
+    if (backend == ROCM)
+      return hipGetInfo<hipDeviceAttributeMaxSharedMemoryPerBlock>(device);
+    return -1;
   });
 
   // query DRAM & L2 cache
   m.def("memory_clock_rate", [](backend_t backend, uint64_t device) {
-    if (backend == CUDA) return cuGetInfo<CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE>(device);
+    if (backend == CUDA)
+      return cuGetInfo<CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE>(device);
     return -1;
   });
   m.def("global_memory_bus_width", [](backend_t backend, uint64_t device) {
-    if (backend == CUDA) return cuGetInfo<CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH>(device);
+    if (backend == CUDA)
+      return cuGetInfo<CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH>(device);
     return -1;
   });
   m.def("l2_cache_size", [](backend_t backend, uint64_t device) {
-    if (backend == CUDA) return cuGetInfo<CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE>(device);
+    if (backend == CUDA)
+      return cuGetInfo<CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE>(device);
     return -1;
   });
 
   // query clock rate (in kilohertz)
   m.def("clock_rate", [](backend_t backend, uint64_t device) {
-    if (backend == CUDA) return cuGetInfo<CU_DEVICE_ATTRIBUTE_CLOCK_RATE>(device);
+    if (backend == CUDA)
+      return cuGetInfo<CU_DEVICE_ATTRIBUTE_CLOCK_RATE>(device);
     return -1;
   });
 
   m.def("num_sm", [](backend_t backend, uint64_t device) {
-    if (backend == CUDA) return cuGetInfo<CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT>(device);
+    if (backend == CUDA)
+      return cuGetInfo<CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT>(device);
     return -1;
   });
 
   // enqueue
-  m.def("enqueue", [](backend_t backend, uint64_t stream, uint64_t kernel,
-                      uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
-                      uint64_t block_0, uint64_t block_1, uint64_t block_2,
-                      const std::string &args, int64_t shared_mem){
-    void* args_ptr = (void*)args.data();
-    size_t args_size = args.size();
-    // release the gil in case the enqueue blocks
-    // cuda will block if too many ops are enqueued
-    py::gil_scoped_release allow_threads;
-    if(backend == HOST)
-      host_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
-    if(backend == CUDA)
-      cu_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
-    if(backend == ROCM)
-      hip_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
-  });
-
-  
+  m.def("enqueue",
+        [](backend_t backend, uint64_t stream, uint64_t kernel, uint64_t grid_0,
+           uint64_t grid_1, uint64_t grid_2, uint64_t block_0, uint64_t block_1,
+           uint64_t block_2, const std::string &args, int64_t shared_mem) {
+          void *args_ptr = (void *)args.data();
+          size_t args_size = args.size();
+          // release the gil in case the enqueue blocks
+          // cuda will block if too many ops are enqueued
+          py::gil_scoped_release allow_threads;
+          if (backend == HOST)
+            host_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0,
+                         block_1, block_2, args_ptr, args_size, shared_mem);
+          if (backend == CUDA)
+            cu_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1,
+                       block_2, args_ptr, args_size, shared_mem);
+          if (backend == ROCM)
+            hip_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0,
+                        block_1, block_2, args_ptr, args_size, shared_mem);
+        });
 }
 
 /*****************************************************************************/
@@ -427,15 +442,18 @@ void init_triton_runtime(py::module &&m) {
 /*****************************************************************************/
 typedef std::map<std::string, py::object> asm_map_t;
 
-// --------------------------------------- 
+// ---------------------------------------
 // Load provided assembly code into driver
-// --------------------------------------- 
+// ---------------------------------------
 
 // CUDA
-std::tuple<uint64_t, uint64_t> cu_load_binary(const std::string& name, asm_map_t &asm_map, size_t n_shared_bytes, uint64_t dev){
+std::tuple<uint64_t, uint64_t> cu_load_binary(const std::string &name,
+                                              asm_map_t &asm_map,
+                                              size_t n_shared_bytes,
+                                              uint64_t dev) {
   // load assembly
   std::string assembly;
-  if(asm_map.find("cubin") != asm_map.end())
+  if (asm_map.find("cubin") != asm_map.end())
     assembly = py::cast<std::string>(asm_map["cubin"]);
   else
     assembly = py::cast<std::string>(asm_map["ptx"]);
@@ -446,22 +464,33 @@ std::tuple<uint64_t, uint64_t> cu_load_binary(const std::string& name, asm_map_t
   drv::dispatch::cuModuleGetFunction(&fun, mod, name.c_str());
   // set dynamic shared memory if necessary
   int shared_optin;
-  drv::dispatch::cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
-  if(n_shared_bytes > 49152 && shared_optin > 49152){
+  drv::dispatch::cuDeviceGetAttribute(
+      &shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+      dev);
+  if (n_shared_bytes > 49152 && shared_optin > 49152) {
     drv::dispatch::cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED);
     int shared_total, shared_static;
     int n_spills, n_reg;
-    drv::dispatch::cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, dev);
-    drv::dispatch::cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun);
-    drv::dispatch::cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,  fun);
+    drv::dispatch::cuDeviceGetAttribute(
+        &shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
+        dev);
+    drv::dispatch::cuFuncGetAttribute(&shared_static,
+                                      CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun);
+    drv::dispatch::cuFuncGetAttribute(&n_spills,
+                                      CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun);
     drv::dispatch::cuFuncGetAttribute(&n_reg, CU_FUNC_ATTRIBUTE_NUM_REGS, fun);
-    drv::dispatch::cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static);
+    drv::dispatch::cuFuncSetAttribute(
+        fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+        shared_optin - shared_static);
   }
   return std::make_tuple((uint64_t)mod, (uint64_t)fun);
 }
 
 // ROCM
-std::tuple<uint64_t, uint64_t> hip_load_binary(const std::string& name, asm_map_t &asm_map, size_t n_shared_bytes, uint64_t dev){
+std::tuple<uint64_t, uint64_t> hip_load_binary(const std::string &name,
+                                               asm_map_t &asm_map,
+                                               size_t n_shared_bytes,
+                                               uint64_t dev) {
   py::bytes _assembly = asm_map["hsaco"];
   std::string assembly = py::cast<std::string>(_assembly);
   // HSA-CO -> hipModule
@@ -473,29 +502,34 @@ std::tuple<uint64_t, uint64_t> hip_load_binary(const std::string& name, asm_map_
   return std::make_tuple((uint64_t)mod, (uint64_t)fun);
 }
 
-// --------------------------------------- 
+// ---------------------------------------
 // Compile Triton-IR to assembly
-// --------------------------------------- 
+// ---------------------------------------
 
 // // CUDA
-// std::tuple<std::string, asm_map_t, int> cu_compile_ttir(const std::string& name, ir::module &ir, 
-//                                                                uint64_t device, int num_warps, int num_stages,
-//                                                                asm_map_t &asm_map){
+// std::tuple<std::string, asm_map_t, int> cu_compile_ttir(const std::string&
+// name, ir::module &ir,
+//                                                                uint64_t
+//                                                                device, int
+//                                                                num_warps, int
+//                                                                num_stages,
+//                                                                asm_map_t
+//                                                                &asm_map){
 
 //   int n_shared_bytes;
 //   py::gil_scoped_release allow_threads;
 //   llvm::LLVMContext ctx;
 //   // device properties
 //   CUdevice dev = (CUdevice)device;
-//   size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
-//   size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
-//   size_t cc = major*10 + minor;
-//   int version;
-//   std::string ptxas_path = drv::path_to_ptxas(version);
+//   size_t major =
+//   cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev); size_t minor
+//   = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev); size_t cc =
+//   major*10 + minor; int version; std::string ptxas_path =
+//   drv::path_to_ptxas(version);
 //   // Triton-IR -> NVPTX LLVM-IR
 //   triton::codegen::nvidia_cu_target target(cc);
-//   auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, cc, num_warps, num_stages, n_shared_bytes);
-//   std::string tmp;
+//   auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, cc,
+//   num_warps, num_stages, n_shared_bytes); std::string tmp;
 //   llvm::raw_string_ostream llir(tmp);
 //   llir << *llvm;
 //   llir.flush();
@@ -513,15 +547,21 @@ std::tuple<uint64_t, uint64_t> hip_load_binary(const std::string& name, asm_map_
 // }
 
 // // HIP
-// std::tuple<std::string, asm_map_t, int> hip_compile_ttir(const std::string& name, ir::module &ir, 
-//                                                                 uint64_t device, int num_warps, int num_stages, 
-//                                                                 asm_map_t &asm_map){
+// std::tuple<std::string, asm_map_t, int> hip_compile_ttir(const std::string&
+// name, ir::module &ir,
+//                                                                 uint64_t
+//                                                                 device, int
+//                                                                 num_warps,
+//                                                                 int
+//                                                                 num_stages,
+//                                                                 asm_map_t
+//                                                                 &asm_map){
 //   llvm::LLVMContext ctx;
 //   // Triton-IR -> NVPTX LLVM-IR
 //   triton::codegen::amd_cl_target target;
 //   int n_shared_bytes;
-//   auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, 70, num_warps, num_stages, n_shared_bytes);
-//   std::string tmp;
+//   auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, 70,
+//   num_warps, num_stages, n_shared_bytes); std::string tmp;
 //   llvm::raw_string_ostream llir(tmp);
 //   llir << *llvm;
 //   llir.flush();
@@ -534,7 +574,8 @@ std::tuple<uint64_t, uint64_t> hip_load_binary(const std::string& name, asm_map_
 
 // void init_triton_codegen(py::module &&m) {
 //   m.def(
-//       "compile_ttir", [](backend_t backend, ir::module &ir, uint64_t device, int num_warps, int num_stages) {
+//       "compile_ttir", [](backend_t backend, ir::module &ir, uint64_t device,
+//       int num_warps, int num_stages) {
 //         std::string name = ir.get_function_list()[0]->get_name();
 //         // record asm as we generate
 //         asm_map_t asm_map;
@@ -543,11 +584,14 @@ std::tuple<uint64_t, uint64_t> hip_load_binary(const std::string& name, asm_map_
 //         asm_map["ttir"] = py::cast(ttir.str());
 //         llvm::LLVMContext ctx;
 //         if(backend == CUDA)
-//           return cu_compile_ttir(name, ir, device, num_warps, num_stages, asm_map);
+//           return cu_compile_ttir(name, ir, device, num_warps, num_stages,
+//           asm_map);
 //         if(backend == ROCM)
-//           return hip_compile_ttir(name, ir, device, num_warps, num_stages, asm_map);
+//           return hip_compile_ttir(name, ir, device, num_warps, num_stages,
+//           asm_map);
 //       }, py::return_value_policy::take_ownership);
-//   m.def("load_binary", [](backend_t backend, const std::string& name, asm_map_t &asm_map, size_t n_shared_bytes, uint64_t dev){
+//   m.def("load_binary", [](backend_t backend, const std::string& name,
+//   asm_map_t &asm_map, size_t n_shared_bytes, uint64_t dev){
 // 	py::gil_scoped_release allow_threads;
 //         if(backend == CUDA)
 //           return cu_load_binary(name, asm_map, n_shared_bytes, dev);
@@ -556,7 +600,6 @@ std::tuple<uint64_t, uint64_t> hip_load_binary(const std::string& name, asm_map_
 //       }, py::return_value_policy::take_ownership);
 // }
 
-
 /*****************************************************************************/
 /* Python bindings for triton::ir                                            */
 /*****************************************************************************/
@@ -570,13 +613,13 @@ void init_triton_ir(py::module &&m) {
       .value("CA", mlir::triton::CacheModifier::CA)
       .value("CG", mlir::triton::CacheModifier::CG)
       .export_values();
-  
+
   py::enum_<mlir::triton::EvictionPolicy>(m, "EVICTION_POLICY")
       .value("NORMAL", mlir::triton::EvictionPolicy::NORMAL)
       .value("EVICT_FIRST", mlir::triton::EvictionPolicy::EVICT_FIRST)
       .value("EVICT_LAST", mlir::triton::EvictionPolicy::EVICT_LAST)
       .export_values();
-  
+
   py::enum_<mlir::triton::RedOp>(m, "REDUCE_OP")
       .value("ADD", mlir::triton::RedOp::ADD)
       .value("FADD", mlir::triton::RedOp::FADD)
@@ -585,7 +628,7 @@ void init_triton_ir(py::module &&m) {
       .value("FMIN", mlir::triton::RedOp::FMIN)
       .value("FMAX", mlir::triton::RedOp::FMAX)
       .value("XOR", mlir::triton::RedOp::XOR);
-  
+
   py::enum_<mlir::triton::RMWOp>(m, "ATOMIC_OP")
       .value("ADD", mlir::triton::RMWOp::ADD)
       .value("FADD", mlir::triton::RMWOp::FADD)
@@ -603,12 +646,12 @@ void init_triton_ir(py::module &&m) {
       .def("load_triton", [](mlir::MLIRContext &self) {
         self.getOrLoadDialect<mlir::triton::TritonDialect>();
       });
-      // .def(py::init([](){
-      //   mlir::MLIRContext context;
-      //   context.getOrLoadDialect<mlir::triton.TritonDialect>();
-      //   // TODO: should we return a (raw/unique) pointer here?
-      //   return context;
-      // }));
+  // .def(py::init([](){
+  //   mlir::MLIRContext context;
+  //   context.getOrLoadDialect<mlir::triton.TritonDialect>();
+  //   // TODO: should we return a (raw/unique) pointer here?
+  //   return context;
+  // }));
 
   // py::class_<ir::value>(m, "value")
   //     .def("multiple_of", [](ir::value *self, int val) {
@@ -650,48 +693,48 @@ void init_triton_ir(py::module &&m) {
 
   py::class_<mlir::Type>(m, "type")
       .def("is_integer", &mlir::Type::isInteger)
-      .def("is_fp16", &mlir::Type::isF16)
-      ;
+      .def("is_fp16", &mlir::Type::isF16);
 
   py::class_<mlir::Value>(m, "value")
-      .def("set_attr", [](mlir::Value &self, std::string &name, mlir::Attribute &attr) -> void {
-        if (mlir::Operation *definingOp = self.getDefiningOp())
-          definingOp->setAttr(name, attr);
-        else {
-          /* issue an warning */
-        }
-      })
-      ;
-  py::class_<mlir::BlockArgument, mlir::Value>(m, "block_arguement")
-      ;
+      .def("set_attr",
+           [](mlir::Value &self, std::string &name,
+              mlir::Attribute &attr) -> void {
+             if (mlir::Operation *definingOp = self.getDefiningOp())
+               definingOp->setAttr(name, attr);
+             else {
+               /* issue an warning */
+             }
+           });
+  py::class_<mlir::BlockArgument, mlir::Value>(m, "block_arguement");
 
   py::class_<mlir::Region>(m, "region")
       .def("get_parent_region", &mlir::Region::getParentRegion, ret::reference)
-      .def("size", [](mlir::Region &self) {
-        return self.getBlocks().size();
-      })
-      .def("empty", &mlir::Region::empty)
-      ;
+      .def("size", [](mlir::Region &self) { return self.getBlocks().size(); })
+      .def("empty", &mlir::Region::empty);
 
   py::class_<mlir::Block>(m, "block")
-      .def("arg", [](mlir::Block &self, int index) -> mlir::BlockArgument {
-        return self.getArgument(index);
-      })
+      .def("arg",
+           [](mlir::Block &self, int index) -> mlir::BlockArgument {
+             return self.getArgument(index);
+           })
       .def("get_num_arguments", &mlir::Block::getNumArguments)
       .def("dump", &mlir::Block::dump)
       .def("move_before", &mlir::Block::moveBefore)
       .def("insert_before", &mlir::Block::insertBefore)
       .def("get_parent", &mlir::Block::getParent, ret::reference)
-      .def("merge_block_before", [](mlir::Block &self, mlir::Block &dst) {
-        // ref: RewriterBase::mergeBlocks()
-        if (self.getNumArguments() != 0)
-          throw std::runtime_error("This block has arguments, don't merge");
-        dst.getOperations().splice(dst.end(), self.getOperations());
-        self.dropAllUses();
-        self.erase();
-      })
-      .def("replace_use_in_block_with", [](mlir::Block &self, mlir::Value &v, mlir::Value &newVal) {
-        v.replaceUsesWithIf(newVal, [&](mlir::OpOperand &operand){
+      .def("merge_block_before",
+           [](mlir::Block &self, mlir::Block &dst) {
+             // ref: RewriterBase::mergeBlocks()
+             if (self.getNumArguments() != 0)
+               throw std::runtime_error(
+                   "This block has arguments, don't merge");
+             dst.getOperations().splice(dst.end(), self.getOperations());
+             self.dropAllUses();
+             self.erase();
+           })
+      .def("replace_use_in_block_with", [](mlir::Block &self, mlir::Value &v,
+                                           mlir::Value &newVal) {
+        v.replaceUsesWithIf(newVal, [&](mlir::OpOperand &operand) {
           mlir::Operation *user = operand.getOwner();
           mlir::Block *currentBlock = user->getBlock();
           while (currentBlock) {
@@ -702,8 +745,7 @@ void init_triton_ir(py::module &&m) {
           }
           return false;
         });
-      })
-      ;
+      });
 
   // using eattr = ir::attribute_kind_t;
   // py::enum_<eattr>(m, "attribute_kind")
@@ -721,124 +763,153 @@ void init_triton_ir(py::module &&m) {
 
   // Ops
   py::class_<mlir::OpState>(m, "OpState")
-      .def("set_attr", [](mlir::OpState &self, std::string &name, mlir::Attribute &attr) -> void {
-        self->setAttr(name, attr);
-      })
-      .def("get_num_results", [](mlir::OpState &self) -> unsigned {
-        return self->getNumResults();
-      })
-      .def("get_result", [](mlir::OpState &self, unsigned idx) -> mlir::Value {
-        return self->getResult(idx);
-      })
-      .def("get_region", [](mlir::OpState &self, unsigned idx) -> mlir::Region& {
-        return self->getRegion(idx);
-      }, ret::reference)
-      .def("get_body", [](mlir::scf::ForOp &self, unsigned idx) -> mlir::Block* {
-        return self.getBody(idx);
-      }, ret::reference)
+      .def("set_attr",
+           [](mlir::OpState &self, std::string &name,
+              mlir::Attribute &attr) -> void { self->setAttr(name, attr); })
+      .def(
+          "get_num_results",
+          [](mlir::OpState &self) -> unsigned { return self->getNumResults(); })
+      .def("get_result",
+           [](mlir::OpState &self, unsigned idx) -> mlir::Value {
+             return self->getResult(idx);
+           })
+      .def(
+          "get_region",
+          [](mlir::OpState &self, unsigned idx) -> mlir::Region & {
+            return self->getRegion(idx);
+          },
+          ret::reference)
+      .def(
+          "get_body",
+          [](mlir::scf::ForOp &self, unsigned idx) -> mlir::Block * {
+            return self.getBody(idx);
+          },
+          ret::reference)
       .def("dump", [](mlir::OpState &self) { self->dump(); })
-      .def("str", [](mlir::OpState &self) -> std::string {
-        std::string str;
-        llvm::raw_string_ostream os(str);
-        self->print(os);
-        return str;
-      })
-      .def("append_operand", [](mlir::OpState &self, mlir::Value &val) {
-        self->insertOperands(self->getNumOperands(), val);
-      })
+      .def("str",
+           [](mlir::OpState &self) -> std::string {
+             std::string str;
+             llvm::raw_string_ostream os(str);
+             self->print(os);
+             return str;
+           })
+      .def("append_operand",
+           [](mlir::OpState &self, mlir::Value &val) {
+             self->insertOperands(self->getNumOperands(), val);
+           })
       .def("verify", [](mlir::OpState &self) -> bool {
         return mlir::succeeded(mlir::verify(self.getOperation()));
-      })
-      ;
+      });
   // scf Ops
   py::class_<mlir::scf::ForOp, mlir::OpState>(m, "ForOp");
   py::class_<mlir::scf::IfOp, mlir::OpState>(m, "IfOp")
       .def("get_then_block", &mlir::scf::IfOp::thenBlock, ret::reference)
       .def("get_else_block", &mlir::scf::IfOp::elseBlock, ret::reference)
       .def("get_then_yield", &mlir::scf::IfOp::thenYield)
-      .def("get_else_yield", &mlir::scf::IfOp::elseYield)
-      ;
+      .def("get_else_yield", &mlir::scf::IfOp::elseYield);
   py::class_<mlir::scf::YieldOp, mlir::OpState>(m, "YieldOp");
   py::class_<mlir::scf::WhileOp, mlir::OpState>(m, "WhileOp")
       .def("get_before", &mlir::scf::WhileOp::getBefore, ret::reference)
       .def("get_after", &mlir::scf::WhileOp::getAfter, ret::reference);
   py::class_<mlir::scf::ConditionOp, mlir::OpState>(m, "CondtionOp");
 
-  // dynamic_attr is used to transfer ownership of the MLIR context to the module
+  // dynamic_attr is used to transfer ownership of the MLIR context to the
+  // module
   py::class_<mlir::ModuleOp, mlir::OpState>(m, "module", py::dynamic_attr())
       .def("dump", &mlir::ModuleOp::dump)
-      .def("str", [](mlir::ModuleOp &self) -> std::string {
-        std::string str;
-        llvm::raw_string_ostream os(str);
-        self.print(os);
-        return str;
-      })
-      .def("push_back", [](mlir::ModuleOp &self, mlir::FuncOp &funcOp) -> void {
-        self.push_back(funcOp);
-      })
-      .def("has_function", [](mlir::ModuleOp &self, std::string &funcName) -> bool {
-        if (self.lookupSymbol(funcName))
-          return true;
-        return false;
-      })
-      .def("get_function", [](mlir::ModuleOp &self, std::string &funcName) -> mlir::FuncOp {
-        return self.lookupSymbol<mlir::FuncOp>(funcName);
-      })
-    ;
+      .def("str",
+           [](mlir::ModuleOp &self) -> std::string {
+             std::string str;
+             llvm::raw_string_ostream os(str);
+             self.print(os);
+             return str;
+           })
+      .def("push_back",
+           [](mlir::ModuleOp &self, mlir::FuncOp &funcOp) -> void {
+             self.push_back(funcOp);
+           })
+      .def("has_function",
+           [](mlir::ModuleOp &self, std::string &funcName) -> bool {
+             if (self.lookupSymbol(funcName))
+               return true;
+             return false;
+           })
+      .def("get_function",
+           [](mlir::ModuleOp &self, std::string &funcName) -> mlir::FuncOp {
+             return self.lookupSymbol<mlir::FuncOp>(funcName);
+           });
 
   py::class_<mlir::FuncOp, mlir::OpState>(m, "function")
-    // .def_property_readonly("attrs", &ir::function::attrs)
-    // .def("add_attr", &ir::function::add_attr);
-    .def("args", [](mlir::FuncOp &self, unsigned idx) -> mlir::BlockArgument {
-      return self.getArgument(idx);
-    })
-    .def("add_entry_block", [](mlir::FuncOp &self) -> mlir::Block* {
-      return self.addEntryBlock();
-    }, ret::reference)
-    .def("set_arg_attr", [](mlir::FuncOp &self, int arg_no, const std::string& name, int val){
-      // set arg attributes "name" to value "val"
-      auto attrTy = mlir::IntegerType::get(self.getContext(), 32);
-      self.setArgAttr(arg_no, name, mlir::IntegerAttr::get(attrTy, val));
-    }, ret::reference)
-    .def("reset_type", &mlir::FuncOp::setType)
-    ;
+      // .def_property_readonly("attrs", &ir::function::attrs)
+      // .def("add_attr", &ir::function::add_attr);
+      .def("args",
+           [](mlir::FuncOp &self, unsigned idx) -> mlir::BlockArgument {
+             return self.getArgument(idx);
+           })
+      .def(
+          "add_entry_block",
+          [](mlir::FuncOp &self) -> mlir::Block * {
+            return self.addEntryBlock();
+          },
+          ret::reference)
+      .def(
+          "set_arg_attr",
+          [](mlir::FuncOp &self, int arg_no, const std::string &name, int val) {
+            // set arg attributes "name" to value "val"
+            auto attrTy = mlir::IntegerType::get(self.getContext(), 32);
+            self.setArgAttr(arg_no, name, mlir::IntegerAttr::get(attrTy, val));
+          },
+          ret::reference)
+      .def("reset_type", &mlir::FuncOp::setType);
 
   py::class_<mlir::OpBuilder::InsertPoint>(m, "InsertPoint");
 
   py::class_<mlir::OpBuilder>(m, "builder", py::dynamic_attr())
       .def(py::init<mlir::MLIRContext *>())
       // // getters
-      .def_property_readonly("context", &mlir::OpBuilder::getContext, ret::reference)
-      .def("create_module", [](mlir::OpBuilder &self) -> mlir::ModuleOp {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::ModuleOp>(loc);
-      })
-      .def("ret", [](mlir::OpBuilder &self, std::vector<mlir::Value> &vals) -> void {
-        auto loc = self.getUnknownLoc();
-        self.create<mlir::ReturnOp>(loc, vals);
-      })
-      .def("call", [](mlir::OpBuilder &self, mlir::FuncOp &func, std::vector<mlir::Value> &args) -> mlir::OpState {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::CallOp>(loc, func, args);
-      })
+      .def_property_readonly("context", &mlir::OpBuilder::getContext,
+                             ret::reference)
+      .def("create_module",
+           [](mlir::OpBuilder &self) -> mlir::ModuleOp {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::ModuleOp>(loc);
+           })
+      .def("ret",
+           [](mlir::OpBuilder &self, std::vector<mlir::Value> &vals) -> void {
+             auto loc = self.getUnknownLoc();
+             self.create<mlir::ReturnOp>(loc, vals);
+           })
+      .def("call",
+           [](mlir::OpBuilder &self, mlir::FuncOp &func,
+              std::vector<mlir::Value> &args) -> mlir::OpState {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::CallOp>(loc, func, args);
+           })
       // insertion block/point
-      .def("set_insertion_point_to_start", [](mlir::OpBuilder &self, mlir::Block &block) -> void {
-        self.setInsertionPointToStart(&block);
-      })
-      .def("set_insertion_point_to_end", [](mlir::OpBuilder &self, mlir::Block &block) {
-        self.setInsertionPointToEnd(&block);
-      })
-      .def("get_insertion_block", [](mlir::OpBuilder &self) -> mlir::Block* {
-        return self.getInsertionBlock();
-      }, ret::reference)
+      .def("set_insertion_point_to_start",
+           [](mlir::OpBuilder &self, mlir::Block &block) -> void {
+             self.setInsertionPointToStart(&block);
+           })
+      .def("set_insertion_point_to_end",
+           [](mlir::OpBuilder &self, mlir::Block &block) {
+             self.setInsertionPointToEnd(&block);
+           })
+      .def(
+          "get_insertion_block",
+          [](mlir::OpBuilder &self) -> mlir::Block * {
+            return self.getInsertionBlock();
+          },
+          ret::reference)
       .def("get_insertion_point", &mlir::OpBuilder::saveInsertionPoint)
       .def("restore_insertion_point", &mlir::OpBuilder::restoreInsertionPoint)
-      // .def("set_insert_point", [](ir::builder *self, std::pair<ir::basic_block*, ir::instruction*> pt) {
+      // .def("set_insert_point", [](ir::builder *self,
+      // std::pair<ir::basic_block*, ir::instruction*> pt) {
       //   ir::basic_block *bb = pt.first;
       //   ir::instruction *instr = pt.second;
       //   if (instr) {
       //     if (bb != instr->get_parent())
-      //       throw std::runtime_error("invalid insertion point, instr not in bb");
+      //       throw std::runtime_error("invalid insertion point, instr not in
+      //       bb");
       //     self->set_insert_point(instr);
       //   } else {
       //     assert(bb);
@@ -851,536 +922,721 @@ void init_triton_ir(py::module &&m) {
       // Use arith.ConstantOp to create constants
       // // Constants
       // .def("get_int1", &ir::builder::get_int1, ret::reference)
-      .def("get_int32", [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { 
-        auto loc = self.getUnknownLoc();
-        return mlir::Value(self.create<mlir::arith::ConstantIntOp>(
-          loc, v, self.getI32Type()
-        ));
-      })
+      .def("get_int32",
+           [](mlir::OpBuilder &self, int64_t v) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return mlir::Value(self.create<mlir::arith::ConstantIntOp>(
+                 loc, v, self.getI32Type()));
+           })
       // .def("get_uint32", &ir::builder::get_int32, ret::reference)
-      // .def("get_int64", [](ir::builder *self, int64_t v) { return self->get_int64((uint64_t)v); }, ret::reference)
-      // .def("get_uint64", &ir::builder::get_int64, ret::reference)
-      // .def("get_float16", &ir::builder::get_float16, ret::reference)
-      .def("get_float32", [](mlir::OpBuilder &self, float v) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::ConstantOp>(loc, self.getF32FloatAttr(v));
-      })
-      .def("get_null_value", [](mlir::OpBuilder &self, mlir::Type &type) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        if (type.isa<mlir::FloatType>())
-          return self.create<mlir::arith::ConstantOp>(loc, self.getF32FloatAttr(0.0));
-        else
-          throw std::runtime_error("Not implemented");
-      })
+      // .def("get_int64", [](ir::builder *self, int64_t v) { return
+      // self->get_int64((uint64_t)v); }, ret::reference)
+      // .def("get_uint64", &ir::builder::get_int64, ret::reference)
+      // .def("get_float16", &ir::builder::get_float16, ret::reference)
+      .def("get_float32",
+           [](mlir::OpBuilder &self, float v) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::ConstantOp>(
+                 loc, self.getF32FloatAttr(v));
+           })
+      .def("get_null_value",
+           [](mlir::OpBuilder &self, mlir::Type &type) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             if (type.isa<mlir::FloatType>())
+               return self.create<mlir::arith::ConstantOp>(
+                   loc, self.getF32FloatAttr(0.0));
+             else
+               throw std::runtime_error("Not implemented");
+           })
 
       // Types
-      .def("get_void_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getNoneType();
-      })
-      .def("get_int1_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getI1Type();
-      }) // or ret::copy?
-      .def("get_int8_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getI8Type();
-      })
-      .def("get_int16_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getType<mlir::IntegerType>(16);
-      })
-      .def("get_int32_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getI32Type();
-      })
-      .def("get_int64_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getI64Type();
-      })
-      .def("get_fp8_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getType<mlir::triton::Float8Type>();
-      })
-      .def("get_bf8_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getType<mlir::triton::BFloat8Type>();
-      })
-      .def("get_half_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getF16Type();
-      })
-      .def("get_bf16_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getBF16Type();
-      })
-      .def("get_float_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getF32Type();
-      })
-      .def("get_double_ty", [](mlir::OpBuilder &self) -> mlir::Type {
-        return self.getF64Type();
-      })
-      .def("get_ptr_ty", [](mlir::OpBuilder &self, mlir::Type &type, int addrSpace) -> mlir::Type {
-        return mlir::triton::PointerType::get(type, addrSpace);
-      })
-      .def("get_block_ty", [](mlir::OpBuilder &self, mlir::Type &elementType,
-                              std::vector<int64_t> &shape) -> mlir::Type {
-        return mlir::RankedTensorType::get(shape, elementType);
-      })
-      .def("get_function_ty", [](mlir::OpBuilder &self,
-                                 std::vector<mlir::Type> inTypes,
-                                 std::vector<mlir::Type> outTypes) -> mlir::Type {
-        return self.getFunctionType(inTypes, outTypes);
-      })
+      .def("get_void_ty",
+           [](mlir::OpBuilder &self) -> mlir::Type {
+             return self.getNoneType();
+           })
+      .def("get_int1_ty",
+           [](mlir::OpBuilder &self) -> mlir::Type {
+             return self.getI1Type();
+           }) // or ret::copy?
+      .def("get_int8_ty",
+           [](mlir::OpBuilder &self) -> mlir::Type { return self.getI8Type(); })
+      .def("get_int16_ty",
+           [](mlir::OpBuilder &self) -> mlir::Type {
+             return self.getType<mlir::IntegerType>(16);
+           })
+      .def(
+          "get_int32_ty",
+          [](mlir::OpBuilder &self) -> mlir::Type { return self.getI32Type(); })
+      .def(
+          "get_int64_ty",
+          [](mlir::OpBuilder &self) -> mlir::Type { return self.getI64Type(); })
+      .def("get_fp8_ty",
+           [](mlir::OpBuilder &self) -> mlir::Type {
+             return self.getType<mlir::triton::Float8Type>();
+           })
+      .def("get_bf8_ty",
+           [](mlir::OpBuilder &self) -> mlir::Type {
+             return self.getType<mlir::triton::BFloat8Type>();
+           })
+      .def(
+          "get_half_ty",
+          [](mlir::OpBuilder &self) -> mlir::Type { return self.getF16Type(); })
+      .def("get_bf16_ty",
+           [](mlir::OpBuilder &self) -> mlir::Type {
+             return self.getBF16Type();
+           })
+      .def(
+          "get_float_ty",
+          [](mlir::OpBuilder &self) -> mlir::Type { return self.getF32Type(); })
+      .def(
+          "get_double_ty",
+          [](mlir::OpBuilder &self) -> mlir::Type { return self.getF64Type(); })
+      .def("get_ptr_ty",
+           [](mlir::OpBuilder &self, mlir::Type &type,
+              int addrSpace) -> mlir::Type {
+             return mlir::triton::PointerType::get(type, addrSpace);
+           })
+      .def("get_block_ty",
+           [](mlir::OpBuilder &self, mlir::Type &elementType,
+              std::vector<int64_t> &shape) -> mlir::Type {
+             return mlir::RankedTensorType::get(shape, elementType);
+           })
+      .def("get_function_ty",
+           [](mlir::OpBuilder &self, std::vector<mlir::Type> inTypes,
+              std::vector<mlir::Type> outTypes) -> mlir::Type {
+             return self.getFunctionType(inTypes, outTypes);
+           })
 
       // Ops
-      .def("create_function", [](mlir::OpBuilder &self, std::string name, mlir::Type &funcType) -> mlir::FuncOp {
-        // TODO: loc
-        auto loc = self.getUnknownLoc();
-        if (auto funcTy = funcType.dyn_cast<mlir::FunctionType>()) {
-          return self.create<mlir::FuncOp>(loc, name, funcTy);
-        }
-        throw std::runtime_error("invalid function type");
-      })
-      .def("get_or_insert_function", [](mlir::OpBuilder &self, mlir::ModuleOp &module,
-                                        std::string &funcName, mlir::Type &funcType) -> mlir::FuncOp {
-        if (mlir::Operation *funcOperation = module.lookupSymbol(funcName))
-          return llvm::dyn_cast<mlir::FuncOp>(funcOperation);
-        auto loc = self.getUnknownLoc();
-        if (auto funcTy = funcType.dyn_cast<mlir::FunctionType>()) {
-          return self.create<mlir::FuncOp>(loc, funcName, funcTy);
-        }
-        throw std::runtime_error("invalid function type");
-      })
-      .def("create_block", [](mlir::OpBuilder &self) -> mlir::Block* {
-        mlir::Region *parent = self.getBlock()->getParent();
-        return self.createBlock(parent);
-      }, ret::reference)
-      .def("create_block_with_parent", [](mlir::OpBuilder &self, mlir::Region &parent, 
-                                          std::vector<mlir::Type> &argTypes) -> mlir::Block* {
-        auto argLoc = self.getUnknownLoc();
-        llvm::SmallVector<mlir::Location, 8> argLocs(argTypes.size(), argLoc);
-        return self.createBlock(&parent, {}, argTypes, argLocs);
-      }, ret::reference)
-      .def("new_block", [](mlir::OpBuilder &self) -> mlir::Block* {
-        return new mlir::Block();
-      }, ret::reference)
+      .def("create_function",
+           [](mlir::OpBuilder &self, std::string name,
+              mlir::Type &funcType) -> mlir::FuncOp {
+             // TODO: loc
+             auto loc = self.getUnknownLoc();
+             if (auto funcTy = funcType.dyn_cast<mlir::FunctionType>()) {
+               return self.create<mlir::FuncOp>(loc, name, funcTy);
+             }
+             throw std::runtime_error("invalid function type");
+           })
+      .def("get_or_insert_function",
+           [](mlir::OpBuilder &self, mlir::ModuleOp &module,
+              std::string &funcName, mlir::Type &funcType) -> mlir::FuncOp {
+             if (mlir::Operation *funcOperation = module.lookupSymbol(funcName))
+               return llvm::dyn_cast<mlir::FuncOp>(funcOperation);
+             auto loc = self.getUnknownLoc();
+             if (auto funcTy = funcType.dyn_cast<mlir::FunctionType>()) {
+               return self.create<mlir::FuncOp>(loc, funcName, funcTy);
+             }
+             throw std::runtime_error("invalid function type");
+           })
+      .def(
+          "create_block",
+          [](mlir::OpBuilder &self) -> mlir::Block * {
+            mlir::Region *parent = self.getBlock()->getParent();
+            return self.createBlock(parent);
+          },
+          ret::reference)
+      .def(
+          "create_block_with_parent",
+          [](mlir::OpBuilder &self, mlir::Region &parent,
+             std::vector<mlir::Type> &argTypes) -> mlir::Block * {
+            auto argLoc = self.getUnknownLoc();
+            llvm::SmallVector<mlir::Location, 8> argLocs(argTypes.size(),
+                                                         argLoc);
+            return self.createBlock(&parent, {}, argTypes, argLocs);
+          },
+          ret::reference)
+      .def(
+          "new_block",
+          [](mlir::OpBuilder &self) -> mlir::Block * {
+            return new mlir::Block();
+          },
+          ret::reference)
       // Structured control flow
-      .def("create_for_op", [](mlir::OpBuilder &self, mlir::Value &lb, mlir::Value &ub,
-                               mlir::Value &step, std::vector<mlir::Value> &initArgs) -> mlir::scf::ForOp {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::scf::ForOp>(loc, lb, ub, step, initArgs);
-      })
-      .def("create_if_op", [](mlir::OpBuilder &self, std::vector<mlir::Type> &retTypes, mlir::Value &condition, bool withElse) -> mlir::scf::IfOp {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::scf::IfOp>(loc, retTypes, condition, withElse);
-      })
-      .def("create_yield_op", [](mlir::OpBuilder &self, std::vector<mlir::Value> &yields) -> mlir::scf::YieldOp {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::scf::YieldOp>(loc, yields);
-      })
-      .def("create_while_op", [](mlir::OpBuilder &self, std::vector<mlir::Type> &retTypes,
-                                 std::vector<mlir::Value> &initArgs) -> mlir::scf::WhileOp {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::scf::WhileOp>(loc, retTypes, initArgs);
-      })
-      .def("create_condtion_op", [](mlir::OpBuilder &self, mlir::Value &cond,
-                                    std::vector<mlir::Value> &args) -> mlir::scf::ConditionOp {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::scf::ConditionOp>(loc, cond, args);
-      })
+      .def("create_for_op",
+           [](mlir::OpBuilder &self, mlir::Value &lb, mlir::Value &ub,
+              mlir::Value &step,
+              std::vector<mlir::Value> &initArgs) -> mlir::scf::ForOp {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::scf::ForOp>(loc, lb, ub, step, initArgs);
+           })
+      .def("create_if_op",
+           [](mlir::OpBuilder &self, std::vector<mlir::Type> &retTypes,
+              mlir::Value &condition, bool withElse) -> mlir::scf::IfOp {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::scf::IfOp>(loc, retTypes, condition,
+                                                 withElse);
+           })
+      .def("create_yield_op",
+           [](mlir::OpBuilder &self,
+              std::vector<mlir::Value> &yields) -> mlir::scf::YieldOp {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::scf::YieldOp>(loc, yields);
+           })
+      .def("create_while_op",
+           [](mlir::OpBuilder &self, std::vector<mlir::Type> &retTypes,
+              std::vector<mlir::Value> &initArgs) -> mlir::scf::WhileOp {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::scf::WhileOp>(loc, retTypes, initArgs);
+           })
+      .def("create_condtion_op",
+           [](mlir::OpBuilder &self, mlir::Value &cond,
+              std::vector<mlir::Value> &args) -> mlir::scf::ConditionOp {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::scf::ConditionOp>(loc, cond, args);
+           })
 
       // miscellious
-      .def("create_make_range", [](mlir::OpBuilder &self, int start, int end) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        auto retType = mlir::RankedTensorType::get({end-start}, self.getI32Type());
-        return self.create<mlir::triton::MakeRangeOp>(loc, retType, start, end);
-      })
-      .def("create_get_program_id", [](mlir::OpBuilder &self, int axis) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::triton::GetProgramIdOp>(loc, self.getI32Type(), axis);
-      })
+      .def("create_make_range",
+           [](mlir::OpBuilder &self, int start, int end) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             auto retType =
+                 mlir::RankedTensorType::get({end - start}, self.getI32Type());
+             return self.create<mlir::triton::MakeRangeOp>(loc, retType, start,
+                                                           end);
+           })
+      .def("create_get_program_id",
+           [](mlir::OpBuilder &self, int axis) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::triton::GetProgramIdOp>(
+                 loc, self.getI32Type(), axis);
+           })
 
       // Cast instructions
-      .def("create_bitcast", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::BitcastOp>(loc, dstType, src);
-      })
+      .def("create_bitcast",
+           [](mlir::OpBuilder &self, mlir::Value &src,
+              mlir::Type &dstType) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::BitcastOp>(loc, dstType, src);
+           })
       // .def("create_cast", &ir::builder::create_cast)
       // .def("create_ptr_to_int", &ir::builder::create_ptr_to_int)
-      .def("create_si_to_fp", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::SIToFPOp>(loc, dstType, src);
-      })
-      .def("create_ui_to_fp", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::UIToFPOp>(loc, dstType, src);
-      })
-      .def("create_fp_to_si", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::FPToSIOp>(loc, dstType, src);
-      })
-      .def("create_fp_to_ui", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::FPToUIOp>(loc, dstType, src);
-      })
-      .def("create_fp_ext", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::ExtFOp>(loc, dstType, src);
-      })
-      .def("create_fp_trunc", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::TruncFOp>(loc, dstType, src);
-      })
+      .def("create_si_to_fp",
+           [](mlir::OpBuilder &self, mlir::Value &src,
+              mlir::Type &dstType) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::SIToFPOp>(loc, dstType, src);
+           })
+      .def("create_ui_to_fp",
+           [](mlir::OpBuilder &self, mlir::Value &src,
+              mlir::Type &dstType) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::UIToFPOp>(loc, dstType, src);
+           })
+      .def("create_fp_to_si",
+           [](mlir::OpBuilder &self, mlir::Value &src,
+              mlir::Type &dstType) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::FPToSIOp>(loc, dstType, src);
+           })
+      .def("create_fp_to_ui",
+           [](mlir::OpBuilder &self, mlir::Value &src,
+              mlir::Type &dstType) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::FPToUIOp>(loc, dstType, src);
+           })
+      .def("create_fp_ext",
+           [](mlir::OpBuilder &self, mlir::Value &src,
+              mlir::Type &dstType) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::ExtFOp>(loc, dstType, src);
+           })
+      .def("create_fp_trunc",
+           [](mlir::OpBuilder &self, mlir::Value &src,
+              mlir::Type &dstType) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::TruncFOp>(loc, dstType, src);
+           })
       // .def("create_int_cast", &ir::builder::create_int_cast)
       // .def("create_downcast", &ir::builder::create_downcast)
-      .def("create_to_index", [](mlir::OpBuilder &self, mlir::Value &input) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::IndexCastOp>(loc, input, self.getIndexType());
-      })
+      .def("create_to_index",
+           [](mlir::OpBuilder &self, mlir::Value &input) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::IndexCastOp>(loc, input,
+                                                          self.getIndexType());
+           })
 
-      .def("create_fmul", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::MulFOp>(loc, lhs, rhs);
-      })
-      .def("create_fdiv", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::DivFOp>(loc, lhs, rhs);
-      })
-      .def("create_frem", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::RemFOp>(loc, lhs, rhs);
-      })
-      .def("create_fadd", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::AddFOp>(loc, lhs, rhs);
-      })
-      .def("create_fsub", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::SubFOp>(loc, lhs, rhs);
-      })
-      .def("create_mul", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::MulIOp>(loc, lhs, rhs);
-      })
-      .def("create_sdiv", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::DivSIOp>(loc, lhs, rhs);
-      })
-      .def("create_udiv", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::DivUIOp>(loc, lhs, rhs);
-      })
-      .def("create_srem", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::RemSIOp>(loc, lhs, rhs);
-      })
-      .def("create_urem", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::RemUIOp>(loc, lhs, rhs);
-      })
-      .def("create_add", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::AddIOp>(loc, lhs, rhs);
-      })
-      .def("create_sub", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return mlir::Value(self.create<mlir::arith::SubIOp>(loc, lhs, rhs));
-      })
-      .def("create_shl", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return mlir::Value(self.create<mlir::arith::ShLIOp>(loc, lhs, rhs));
-      })
-      .def("create_lshr", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return mlir::Value(self.create<mlir::arith::ShRUIOp>(loc, lhs, rhs));
-      })
-      .def("create_ashr", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return mlir::Value(self.create<mlir::arith::ShRSIOp>(loc, lhs, rhs));
-      })
+      .def("create_fmul",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::MulFOp>(loc, lhs, rhs);
+           })
+      .def("create_fdiv",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::DivFOp>(loc, lhs, rhs);
+           })
+      .def("create_frem",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::RemFOp>(loc, lhs, rhs);
+           })
+      .def("create_fadd",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::AddFOp>(loc, lhs, rhs);
+           })
+      .def("create_fsub",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::SubFOp>(loc, lhs, rhs);
+           })
+      .def("create_mul",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::MulIOp>(loc, lhs, rhs);
+           })
+      .def("create_sdiv",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::DivSIOp>(loc, lhs, rhs);
+           })
+      .def("create_udiv",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::DivUIOp>(loc, lhs, rhs);
+           })
+      .def("create_srem",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::RemSIOp>(loc, lhs, rhs);
+           })
+      .def("create_urem",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::RemUIOp>(loc, lhs, rhs);
+           })
+      .def("create_add",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::AddIOp>(loc, lhs, rhs);
+           })
+      .def("create_sub",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return mlir::Value(
+                 self.create<mlir::arith::SubIOp>(loc, lhs, rhs));
+           })
+      .def("create_shl",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return mlir::Value(
+                 self.create<mlir::arith::ShLIOp>(loc, lhs, rhs));
+           })
+      .def("create_lshr",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return mlir::Value(
+                 self.create<mlir::arith::ShRUIOp>(loc, lhs, rhs));
+           })
+      .def("create_ashr",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return mlir::Value(
+                 self.create<mlir::arith::ShRSIOp>(loc, lhs, rhs));
+           })
       // GEP
-      .def("create_gep", [](mlir::OpBuilder &self, mlir::Value &ptr, mlir::Value &offset) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::triton::GEPOp>(loc, ptr.getType(), ptr, offset);
-      })
+      .def("create_gep",
+           [](mlir::OpBuilder &self, mlir::Value &ptr,
+              mlir::Value &offset) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::triton::GEPOp>(loc, ptr.getType(), ptr,
+                                                     offset);
+           })
       // Comparison (int)
-      .def("create_icmpSLE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::sle, lhs, rhs);
-      })
-      .def("create_icmpSLT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::slt, lhs, rhs);
-      })
-      .def("create_icmpSGE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::sge, lhs, rhs);
-      })
-      .def("create_icmpSGT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::sgt, lhs, rhs);
-      })
-      .def("create_icmpULE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::ule, lhs, rhs);
-      })
-      .def("create_icmpULT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::ult, lhs, rhs);
-      })
-      .def("create_icmpUGE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::uge, lhs, rhs);
-      })
-      .def("create_icmpUGT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::ugt, lhs, rhs);
-      })
-      .def("create_icmpEQ", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::eq, lhs, rhs);
-      })
-      .def("create_icmpNE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpIOp>(
-          loc, mlir::arith::CmpIPredicate::ne, lhs, rhs);
-      })
+      .def("create_icmpSLE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpIOp>(
+                 loc, mlir::arith::CmpIPredicate::sle, lhs, rhs);
+           })
+      .def("create_icmpSLT",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpIOp>(
+                 loc, mlir::arith::CmpIPredicate::slt, lhs, rhs);
+           })
+      .def("create_icmpSGE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpIOp>(
+                 loc, mlir::arith::CmpIPredicate::sge, lhs, rhs);
+           })
+      .def("create_icmpSGT",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpIOp>(
+                 loc, mlir::arith::CmpIPredicate::sgt, lhs, rhs);
+           })
+      .def("create_icmpULE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpIOp>(
+                 loc, mlir::arith::CmpIPredicate::ule, lhs, rhs);
+           })
+      .def("create_icmpULT",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpIOp>(
+                 loc, mlir::arith::CmpIPredicate::ult, lhs, rhs);
+           })
+      .def("create_icmpUGE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpIOp>(
+                 loc, mlir::arith::CmpIPredicate::uge, lhs, rhs);
+           })
+      .def("create_icmpUGT",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpIOp>(
+                 loc, mlir::arith::CmpIPredicate::ugt, lhs, rhs);
+           })
+      .def("create_icmpEQ",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpIOp>(
+                 loc, mlir::arith::CmpIPredicate::eq, lhs, rhs);
+           })
+      .def("create_icmpNE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpIOp>(
+                 loc, mlir::arith::CmpIPredicate::ne, lhs, rhs);
+           })
       // Comparison (float)
-      .def("create_fcmpOLT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::OLT, lhs, rhs);
-      })
-      .def("create_fcmpOGT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::OGT, lhs, rhs);
-      })
-      .def("create_fcmpOLE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::OLE, lhs, rhs);
-      })
-      .def("create_fcmpOGE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::OGE, lhs, rhs);
-      })
-      .def("create_fcmpOEQ", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::OEQ, lhs, rhs);
-      })
-      .def("create_fcmpONE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::ONE, lhs, rhs);
-      })
-      .def("create_fcmpULT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::ULT, lhs, rhs);
-      })
-      .def("create_fcmpUGT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::UGT, lhs, rhs);
-      })
-      .def("create_fcmpULE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::ULE, lhs, rhs);
-      })
-      .def("create_fcmpUGE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::UGE, lhs, rhs);
-      })
-      .def("create_fcmpUEQ", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::UEQ, lhs, rhs);
-      })
-      .def("create_fcmpUNE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::CmpFOp>(
-          loc, mlir::arith::CmpFPredicate::UNE, lhs, rhs);
-      })
+      .def("create_fcmpOLT",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::OLT, lhs, rhs);
+           })
+      .def("create_fcmpOGT",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::OGT, lhs, rhs);
+           })
+      .def("create_fcmpOLE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::OLE, lhs, rhs);
+           })
+      .def("create_fcmpOGE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::OGE, lhs, rhs);
+           })
+      .def("create_fcmpOEQ",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::OEQ, lhs, rhs);
+           })
+      .def("create_fcmpONE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::ONE, lhs, rhs);
+           })
+      .def("create_fcmpULT",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::ULT, lhs, rhs);
+           })
+      .def("create_fcmpUGT",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::UGT, lhs, rhs);
+           })
+      .def("create_fcmpULE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::ULE, lhs, rhs);
+           })
+      .def("create_fcmpUGE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::UGE, lhs, rhs);
+           })
+      .def("create_fcmpUEQ",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::UEQ, lhs, rhs);
+           })
+      .def("create_fcmpUNE",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::CmpFOp>(
+                 loc, mlir::arith::CmpFPredicate::UNE, lhs, rhs);
+           })
       // // Logical
-      .def("create_and", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::AndIOp>(loc, lhs, rhs);
-      })
-      .def("create_xor", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::XOrIOp>(loc, lhs, rhs);
-      })
-      .def("create_or", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::arith::OrIOp>(loc, lhs, rhs);
-      })
+      .def("create_and",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::AndIOp>(loc, lhs, rhs);
+           })
+      .def("create_xor",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::XOrIOp>(loc, lhs, rhs);
+           })
+      .def("create_or",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::arith::OrIOp>(loc, lhs, rhs);
+           })
       // // Input/Output
-      .def("create_load", [](mlir::OpBuilder &self, mlir::Value &ptrs,
-                             mlir::triton::CacheModifier cacheModifer,
-                             mlir::triton::EvictionPolicy evictionPolicy,
-                             bool isVolatile) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::triton::LoadOp>(loc, ptrs, cacheModifer, evictionPolicy, isVolatile);
-      })
-      .def("create_store", [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &value) -> void {
-        auto loc = self.getUnknownLoc();
-        self.create<mlir::triton::StoreOp>(loc, ptrs, value);
-      })
-      .def("create_masked_load", [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &mask, mlir::Value &other,
-                                    mlir::triton::CacheModifier cacheModifier,
-                                    mlir::triton::EvictionPolicy evictionPolicy,
-                                    bool isVolatile) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        auto ptrType = ptrs.getType().dyn_cast<mlir::RankedTensorType>();
-        std::vector<int64_t> shape = ptrType.getShape();
-        mlir::Type elementType = ptrType.getElementType().dyn_cast<mlir::triton::PointerType>().getPointeeType();
-        return self.create<mlir::triton::LoadOp>(
-          loc, mlir::RankedTensorType::get(shape, elementType), ptrs, mask, other,
-          cacheModifier, evictionPolicy, isVolatile);
-      })
-      .def("create_masked_store", [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &val, mlir::Value &mask) -> void {
-        auto loc = self.getUnknownLoc();
-        self.create<mlir::triton::StoreOp>(loc, ptrs, val, mask);
-      })
+      .def("create_load",
+           [](mlir::OpBuilder &self, mlir::Value &ptrs,
+              mlir::triton::CacheModifier cacheModifer,
+              mlir::triton::EvictionPolicy evictionPolicy,
+              bool isVolatile) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::triton::LoadOp>(
+                 loc, ptrs, cacheModifer, evictionPolicy, isVolatile);
+           })
+      .def("create_store",
+           [](mlir::OpBuilder &self, mlir::Value &ptrs,
+              mlir::Value &value) -> void {
+             auto loc = self.getUnknownLoc();
+             self.create<mlir::triton::StoreOp>(loc, ptrs, value);
+           })
+      .def("create_masked_load",
+           [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &mask,
+              mlir::Value &other, mlir::triton::CacheModifier cacheModifier,
+              mlir::triton::EvictionPolicy evictionPolicy,
+              bool isVolatile) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             auto ptrType = ptrs.getType().dyn_cast<mlir::RankedTensorType>();
+             std::vector<int64_t> shape = ptrType.getShape();
+             mlir::Type elementType = ptrType.getElementType()
+                                          .dyn_cast<mlir::triton::PointerType>()
+                                          .getPointeeType();
+             return self.create<mlir::triton::LoadOp>(
+                 loc, mlir::RankedTensorType::get(shape, elementType), ptrs,
+                 mask, other, cacheModifier, evictionPolicy, isVolatile);
+           })
+      .def("create_masked_store",
+           [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &val,
+              mlir::Value &mask) -> void {
+             auto loc = self.getUnknownLoc();
+             self.create<mlir::triton::StoreOp>(loc, ptrs, val, mask);
+           })
       // Block instruction
-      .def("create_reshape", [](mlir::OpBuilder &self, mlir::Value &arg, std::vector<int64_t> &shape) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        auto argType = arg.getType().dyn_cast<mlir::RankedTensorType>().getElementType();
-        return self.create<mlir::triton::ReshapeOp>(
-          loc, mlir::RankedTensorType::get(shape, argType), arg
-        );
-      })
-      .def("create_cat", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        auto lhsType = lhs.getType().dyn_cast<mlir::RankedTensorType>();
-        auto rhsType = rhs.getType().dyn_cast<mlir::RankedTensorType>();
-        if (!(lhsType.getShape().size() == 1 && rhsType.getShape().size() == 1))
-          throw std::runtime_error("shape not supported by cat. Expecting rank-1 inputs");
-        std::vector<int64_t> shape {lhsType.getShape()[0] + rhsType.getShape()[0]};
-        return self.create<mlir::triton::CatOp>(
-          loc, mlir::RankedTensorType::get(shape, lhsType.getElementType()), lhs, rhs
-        );
-      })
-      .def("create_broadcast", [](mlir::OpBuilder &self, mlir::Value &arg, std::vector<int64_t> &shape) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        if (auto argType = arg.getType().dyn_cast<mlir::RankedTensorType>())
-          return self.createOrFold<mlir::triton::BroadcastOp>(
-            loc, mlir::RankedTensorType::get(shape, argType.getElementType()), arg
-          );
-        throw std::runtime_error("arg is not of RankedTensorType, use create_splat");
-      })
-      .def("create_splat", [](mlir::OpBuilder &self, mlir::Value &arg, std::vector<int64_t> &shape) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        auto argType = arg.getType();
-        auto ret = self.createOrFold<mlir::triton::SplatOp>(
-          loc, mlir::RankedTensorType::get(shape, argType), arg
-        );
-        return ret;
-      })
+      .def("create_reshape",
+           [](mlir::OpBuilder &self, mlir::Value &arg,
+              std::vector<int64_t> &shape) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             auto argType = arg.getType()
+                                .dyn_cast<mlir::RankedTensorType>()
+                                .getElementType();
+             return self.create<mlir::triton::ReshapeOp>(
+                 loc, mlir::RankedTensorType::get(shape, argType), arg);
+           })
+      .def("create_cat",
+           [](mlir::OpBuilder &self, mlir::Value &lhs,
+              mlir::Value &rhs) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             auto lhsType = lhs.getType().dyn_cast<mlir::RankedTensorType>();
+             auto rhsType = rhs.getType().dyn_cast<mlir::RankedTensorType>();
+             if (!(lhsType.getShape().size() == 1 &&
+                   rhsType.getShape().size() == 1))
+               throw std::runtime_error(
+                   "shape not supported by cat. Expecting rank-1 inputs");
+             std::vector<int64_t> shape{lhsType.getShape()[0] +
+                                        rhsType.getShape()[0]};
+             return self.create<mlir::triton::CatOp>(
+                 loc,
+                 mlir::RankedTensorType::get(shape, lhsType.getElementType()),
+                 lhs, rhs);
+           })
+      .def("create_broadcast",
+           [](mlir::OpBuilder &self, mlir::Value &arg,
+              std::vector<int64_t> &shape) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             if (auto argType =
+                     arg.getType().dyn_cast<mlir::RankedTensorType>())
+               return self.createOrFold<mlir::triton::BroadcastOp>(
+                   loc,
+                   mlir::RankedTensorType::get(shape, argType.getElementType()),
+                   arg);
+             throw std::runtime_error(
+                 "arg is not of RankedTensorType, use create_splat");
+           })
+      .def("create_splat",
+           [](mlir::OpBuilder &self, mlir::Value &arg,
+              std::vector<int64_t> &shape) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             auto argType = arg.getType();
+             auto ret = self.createOrFold<mlir::triton::SplatOp>(
+                 loc, mlir::RankedTensorType::get(shape, argType), arg);
+             return ret;
+           })
       // // atomic
-      .def("create_atomic_cas", [](mlir::OpBuilder &self, mlir::Value &ptr, 
-                                   mlir::Value &cmp, mlir::Value &val) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        auto ptrType = ptr.getType().dyn_cast<mlir::triton::PointerType>();
-        mlir::Type dstType = ptrType.getPointeeType();
-        return self.create<mlir::triton::AtomicCASOp>(
-          loc, dstType, ptr, cmp, val
-        );
-      })
-      .def("create_atomic_rmw", [](mlir::OpBuilder &self, mlir::triton::RMWOp rmwOp,
-                                   mlir::Value &ptr, mlir::Value &val, mlir::Value &mask) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        auto ptrType = ptr.getType().dyn_cast<mlir::triton::PointerType>();
-        mlir::Type dstType = ptrType.getPointeeType();
-        return self.create<mlir::triton::AtomicRMWOp>(
-          loc, dstType, rmwOp, ptr, val, mask
-        );
-      })
+      .def("create_atomic_cas",
+           [](mlir::OpBuilder &self, mlir::Value &ptr, mlir::Value &cmp,
+              mlir::Value &val) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             auto ptrType = ptr.getType().dyn_cast<mlir::triton::PointerType>();
+             mlir::Type dstType = ptrType.getPointeeType();
+             return self.create<mlir::triton::AtomicCASOp>(loc, dstType, ptr,
+                                                           cmp, val);
+           })
+      .def("create_atomic_rmw",
+           [](mlir::OpBuilder &self, mlir::triton::RMWOp rmwOp,
+              mlir::Value &ptr, mlir::Value &val,
+              mlir::Value &mask) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             auto ptrType = ptr.getType().dyn_cast<mlir::triton::PointerType>();
+             mlir::Type dstType = ptrType.getPointeeType();
+             return self.create<mlir::triton::AtomicRMWOp>(loc, dstType, rmwOp,
+                                                           ptr, val, mask);
+           })
 
       // Built-in instruction
-      .def("create_get_program_id", [](mlir::OpBuilder &self, int axis) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::triton::GetProgramIdOp>(
-          loc, self.getI32Type(), self.getI32IntegerAttr(axis)
-        );
-      })
-      .def("create_get_num_programs", [](mlir::OpBuilder &self, int axis) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::triton::GetNumProgramsOp>(
-          loc, self.getI32Type(), self.getI32IntegerAttr(axis)
-        );
-      })
-      .def("create_dot", [](mlir::OpBuilder &self, mlir::Value &a, mlir::Value &b, mlir::Value &c, bool allowTF32) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::triton::DotOp>(loc, c.getType(), a, b, c, allowTF32);
-      })
+      .def("create_get_program_id",
+           [](mlir::OpBuilder &self, int axis) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::triton::GetProgramIdOp>(
+                 loc, self.getI32Type(), self.getI32IntegerAttr(axis));
+           })
+      .def("create_get_num_programs",
+           [](mlir::OpBuilder &self, int axis) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::triton::GetNumProgramsOp>(
+                 loc, self.getI32Type(), self.getI32IntegerAttr(axis));
+           })
+      .def("create_dot",
+           [](mlir::OpBuilder &self, mlir::Value &a, mlir::Value &b,
+              mlir::Value &c, bool allowTF32) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::triton::DotOp>(loc, c.getType(), a, b, c,
+                                                     allowTF32);
+           })
       // .def("create_exp", &ir::builder::create_exp, ret::reference)
       // .def("create_cos", &ir::builder::create_cos, ret::reference)
       // .def("create_sin", &ir::builder::create_sin, ret::reference)
-      // .def("create_log", &ir::builder::create_log, ret::reference)      
+      // .def("create_log", &ir::builder::create_log, ret::reference)
       // .def("create_trans", &ir::builder::create_trans, ret::reference)
       // .def("create_sqrt", &ir::builder::create_sqrt, ret::reference)
-      .def("create_reduce", [](mlir::OpBuilder &self, mlir::Value &operand,
-                               mlir::triton::RedOp redOp, int axis) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        auto inputTensorType = operand.getType().dyn_cast<mlir::RankedTensorType>();
-        std::vector<int64_t> shape = inputTensorType.getShape();
-        shape.erase(shape.begin() + axis);
-        auto resType = mlir::RankedTensorType::get(shape, inputTensorType.getElementType());
-        return self.create<mlir::triton::ReduceOp>(loc, resType, redOp, operand, axis);
-      })
-      .def("create_select", [](mlir::OpBuilder &self, mlir::Value &condition,
-                               mlir::Value &trueValue, mlir::Value &falseValue) -> mlir::Value {
-        auto loc = self.getUnknownLoc();
-        return self.create<mlir::SelectOp>(loc, condition, trueValue, falseValue);
-      })
+      .def("create_reduce",
+           [](mlir::OpBuilder &self, mlir::Value &operand,
+              mlir::triton::RedOp redOp, int axis) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             auto inputTensorType =
+                 operand.getType().dyn_cast<mlir::RankedTensorType>();
+             std::vector<int64_t> shape = inputTensorType.getShape();
+             shape.erase(shape.begin() + axis);
+             auto resType = mlir::RankedTensorType::get(
+                 shape, inputTensorType.getElementType());
+             return self.create<mlir::triton::ReduceOp>(loc, resType, redOp,
+                                                        operand, axis);
+           })
+      .def("create_select",
+           [](mlir::OpBuilder &self, mlir::Value &condition,
+              mlir::Value &trueValue, mlir::Value &falseValue) -> mlir::Value {
+             auto loc = self.getUnknownLoc();
+             return self.create<mlir::SelectOp>(loc, condition, trueValue,
+                                                falseValue);
+           })
       // // Intrinsics
-      // // These have no place in the IR, and hopefully they can be removed at some point
-      // .def("create_umulhi", &ir::builder::create_umulhi, ret::reference)
-      // .def("create_barrier", &ir::builder::create_barrier, ret::reference);
+      // // These have no place in the IR, and hopefully they can be removed at
+      // // some point
+      // .def("create_umulhi", &ir::builder::create_umulhi, ret::reference)
+      // .def("create_barrier", &ir::builder::create_barrier, ret::reference);
       ;
 
   py::class_<mlir::PassManager>(m, "pass_manager")
       .def(py::init<mlir::MLIRContext *>())
-      .def("run", [](mlir::PassManager &self, mlir::ModuleOp &mod) -> bool {
-        return mlir::succeeded(self.run(mod.getOperation()));
-      })
-      .def("add_sccp_pass", [](mlir::PassManager &self) {
-        self.addPass(mlir::createSCCPPass());
-      })
-      .def("add_symbol_dce_pass", [](mlir::PassManager &self) {
-        self.addPass(mlir::createSymbolDCEPass());
-      })
-      .def("add_inliner_pass", [](mlir::PassManager &self) {
-        self.addPass(mlir::createInlinerPass());
-      })
-      .def("add_canonicalizer_pass", [](mlir::PassManager &self) {
-        self.addPass(mlir::createCanonicalizerPass());
-      })
-      .def("add_cse_pass", [](mlir::PassManager &self) {
-        self.addPass(mlir::createCSEPass());
-      })
-      .def("add_triton_combine_pass", [](mlir::PassManager &self) {
-        self.addPass(mlir::triton::createCombineOpsPass());
-      })
-      .def("add_convert_triton_to_tritongpu_pass", [](mlir::PassManager &self, int numWarps) {
-        self.addPass(mlir::triton::createConvertTritonToTritonGPUPass(numWarps));
-      })
-      .def("add_tritongpu_pipeline_pass", [](mlir::PassManager &self, int numStages) {
-        self.addPass(mlir::createTritonGPUPipelinePass(numStages));
-      })
-      .def("add_triton_gpu_combine_pass", [](mlir::PassManager &self) {
-        self.addPass(mlir::createTritonGPUCombineOpsPass());
-      })
+      .def("run",
+           [](mlir::PassManager &self, mlir::ModuleOp &mod) -> bool {
+             return mlir::succeeded(self.run(mod.getOperation()));
+           })
+      .def(
+          "add_sccp_pass",
+          [](mlir::PassManager &self) { self.addPass(mlir::createSCCPPass()); })
+      .def("add_symbol_dce_pass",
+           [](mlir::PassManager &self) {
+             self.addPass(mlir::createSymbolDCEPass());
+           })
+      .def("add_inliner_pass",
+           [](mlir::PassManager &self) {
+             self.addPass(mlir::createInlinerPass());
+           })
+      .def("add_canonicalizer_pass",
+           [](mlir::PassManager &self) {
+             self.addPass(mlir::createCanonicalizerPass());
+           })
+      .def("add_cse_pass",
+           [](mlir::PassManager &self) { self.addPass(mlir::createCSEPass()); })
+      .def("add_triton_combine_pass",
+           [](mlir::PassManager &self) {
+             self.addPass(mlir::triton::createCombineOpsPass());
+           })
+      .def("add_convert_triton_to_tritongpu_pass",
+           [](mlir::PassManager &self, int numWarps) {
+             self.addPass(
+                 mlir::triton::createConvertTritonToTritonGPUPass(numWarps));
+           })
+      .def("add_tritongpu_pipeline_pass",
+           [](mlir::PassManager &self, int numStages) {
+             self.addPass(mlir::createTritonGPUPipelinePass(numStages));
+           })
+      .def("add_triton_gpu_combine_pass",
+           [](mlir::PassManager &self) {
+             self.addPass(mlir::createTritonGPUCombineOpsPass());
+           })
       .def("add_triton_gpu_verifier_pass", [](mlir::PassManager &self) {
         self.addPass(mlir::createTritonGPUVerifier());
-      })
-      ;
+      });
 }
 
 void init_triton(py::module &m) {
diff --git a/test/lib/Analysis/TestAxisInfo.cpp b/test/lib/Analysis/TestAxisInfo.cpp
index fd6493cbf..94ff92f4d 100644
--- a/test/lib/Analysis/TestAxisInfo.cpp
+++ b/test/lib/Analysis/TestAxisInfo.cpp
@@ -1,48 +1,51 @@
-#include "triton/Analysis/AxisInfo.h"
 #include "mlir/Pass/Pass.h"
+#include "triton/Analysis/AxisInfo.h"
 
 using namespace mlir;
 
-namespace{
+namespace {
 
 struct TestAxisInfoPass
-    : public PassWrapper<TestAxisInfoPass,  OperationPass<FuncOp>>{
-      
+    : public PassWrapper<TestAxisInfoPass, OperationPass<FuncOp>> {
+
   // LLVM15+
   // MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAlignmentPass);
-  
-  void print(const std::string& name, raw_ostream& os, ArrayRef<int> vals){
+
+  void print(const std::string &name, raw_ostream &os, ArrayRef<int> vals) {
     os << name << ": [";
-    for(size_t d = 0; d < vals.size(); d++){
-      if(d != 0) os << ", ";
+    for (size_t d = 0; d < vals.size(); d++) {
+      if (d != 0)
+        os << ", ";
       os << vals[d];
     }
     os << "]";
   }
 
   StringRef getArgument() const final { return "test-print-alignment"; }
-  StringRef getDescription() const final 
-  { return "print the result of the alignment analysis pass"; }
+  StringRef getDescription() const final {
+    return "print the result of the alignment analysis pass";
+  }
 
   void runOnOperation() override {
-    Operation* operation = getOperation();
-    auto& os = llvm::errs();
+    Operation *operation = getOperation();
+    auto &os = llvm::errs();
     os << "Testing: " << operation->getName() << "\n";
     AxisInfoAnalysis analysis(&getContext());
     analysis.run(operation);
-    operation->walk([&](Operation* op){
-      if(op->getNumResults() < 1)
+    operation->walk([&](Operation *op) {
+      if (op->getNumResults() < 1)
         return;
-      for(Value result: op->getResults()){
+      for (Value result : op->getResults()) {
         // std::ostringstream oss;
         // result.print(oss);
         // os << " => ";
-        LatticeElement<AxisInfo> *latticeElement = analysis.lookupLatticeElement(result);
-        if(!latticeElement){
+        LatticeElement<AxisInfo> *latticeElement =
+            analysis.lookupLatticeElement(result);
+        if (!latticeElement) {
           os << "None\n";
           return;
         }
-        AxisInfo& info = latticeElement->getValue();
+        AxisInfo &info = latticeElement->getValue();
         print("Contiguity", os, info.getContiguity());
         os << " ; ";
         print("Divisibility", os, info.getDivisibility());
@@ -50,18 +53,17 @@ struct TestAxisInfoPass
         print("Constancy", os, info.getConstancy());
         os << " ( ";
         result.print(os);
-        os << " ) "; 
+        os << " ) ";
         os << "\n";
       }
     });
   }
 };
 
-}
+} // namespace
 
-namespace mlir{
-namespace test{
+namespace mlir {
+namespace test {
 void registerTestAlignmentPass() { PassRegistration<TestAxisInfoPass>(); }
-}
-}
-
+} // namespace test
+} // namespace mlir